Diffstat (limited to 'arch/arm64')
-rw-r--r--  arch/arm64/Kconfig | 168
-rw-r--r--  arch/arm64/Kconfig.debug | 29
-rw-r--r--  arch/arm64/Kconfig.platforms | 3
-rw-r--r--  arch/arm64/Makefile | 16
-rw-r--r--  arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts | 2
-rw-r--r--  arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 18
-rw-r--r--  arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/amlogic/meson-g12.dtsi | 1
-rw-r--r--  arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi | 4
-rw-r--r--  arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mm.dtsi | 8
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mn.dtsi | 10
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h | 46
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mp.dtsi | 6
-rw-r--r--  arch/arm64/boot/dts/freescale/imx8mq.dtsi | 8
-rw-r--r--  arch/arm64/boot/dts/mediatek/mt8173.dtsi | 4
-rw-r--r--  arch/arm64/boot/dts/mediatek/mt8516.dtsi | 17
-rw-r--r--  arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi | 34
-rw-r--r--  arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi | 23
-rw-r--r--  arch/arm64/boot/dts/qcom/msm8996.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/qcom/sdm845-db845c.dts | 3
-rw-r--r--  arch/arm64/boot/dts/qcom/sdm845.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 2
-rw-r--r--  arch/arm64/boot/dts/renesas/r8a77970-eagle.dts | 2
-rw-r--r--  arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts | 2
-rw-r--r--  arch/arm64/boot/dts/renesas/r8a77980-condor.dts | 2
-rw-r--r--  arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts | 2
-rw-r--r--  arch/arm64/boot/dts/renesas/r8a77980.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts | 2
-rw-r--r--  arch/arm64/boot/dts/renesas/r8a77995-draak.dts | 6
-rw-r--r--  arch/arm64/boot/dts/rockchip/px30.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3308.dtsi | 2
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3328-evb.dts | 5
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3328-rock64.dts | 2
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3328.dtsi | 18
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts | 9
-rw-r--r--  arch/arm64/boot/dts/rockchip/rk3399.dtsi | 14
-rw-r--r--  arch/arm64/boot/dts/ti/k3-am65-main.dtsi | 22
-rw-r--r--  arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi | 21
-rw-r--r--  arch/arm64/boot/dts/ti/k3-j721e-main.dtsi | 12
-rw-r--r--  arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi | 11
-rw-r--r--  arch/arm64/configs/defconfig | 28
-rw-r--r--  arch/arm64/crypto/aes-glue.c | 4
-rw-r--r--  arch/arm64/crypto/chacha-neon-glue.c | 14
-rw-r--r--  arch/arm64/crypto/crct10dif-ce-core.S | 2
-rw-r--r--  arch/arm64/crypto/nhpoly1305-neon-glue.c | 2
-rw-r--r--  arch/arm64/crypto/poly1305-glue.c | 15
-rw-r--r--  arch/arm64/crypto/sha256-glue.c | 1
-rw-r--r--  arch/arm64/crypto/sha512-glue.c | 1
-rw-r--r--  arch/arm64/include/asm/asm_pointer_auth.h | 43
-rw-r--r--  arch/arm64/include/asm/assembler.h | 50
-rw-r--r--  arch/arm64/include/asm/cacheflush.h | 6
-rw-r--r--  arch/arm64/include/asm/compiler.h | 4
-rw-r--r--  arch/arm64/include/asm/cpu.h | 4
-rw-r--r--  arch/arm64/include/asm/cpucaps.h | 17
-rw-r--r--  arch/arm64/include/asm/cpufeature.h | 30
-rw-r--r--  arch/arm64/include/asm/debug-monitors.h | 2
-rw-r--r--  arch/arm64/include/asm/efi.h | 8
-rw-r--r--  arch/arm64/include/asm/elf.h | 50
-rw-r--r--  arch/arm64/include/asm/esr.h | 2
-rw-r--r--  arch/arm64/include/asm/exception.h | 1
-rw-r--r--  arch/arm64/include/asm/hardirq.h | 78
-rw-r--r--  arch/arm64/include/asm/hugetlb.h | 13
-rw-r--r--  arch/arm64/include/asm/hwcap.h | 1
-rw-r--r--  arch/arm64/include/asm/insn.h | 30
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h | 4
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h | 6
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 52
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h | 32
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h | 33
-rw-r--r--  arch/arm64/include/asm/linkage.h | 46
-rw-r--r--  arch/arm64/include/asm/mman.h | 37
-rw-r--r--  arch/arm64/include/asm/pgalloc.h | 10
-rw-r--r--  arch/arm64/include/asm/pgtable-hwdef.h | 2
-rw-r--r--  arch/arm64/include/asm/pgtable-prot.h | 11
-rw-r--r--  arch/arm64/include/asm/pgtable-types.h | 5
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 49
-rw-r--r--  arch/arm64/include/asm/ptrace.h | 1
-rw-r--r--  arch/arm64/include/asm/scs.h | 29
-rw-r--r--  arch/arm64/include/asm/smp.h | 11
-rw-r--r--  arch/arm64/include/asm/stacktrace.h | 40
-rw-r--r--  arch/arm64/include/asm/stage2_pgtable.h | 48
-rw-r--r--  arch/arm64/include/asm/suspend.h | 2
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 77
-rw-r--r--  arch/arm64/include/asm/thread_info.h | 13
-rw-r--r--  arch/arm64/include/asm/uaccess.h | 2
-rw-r--r--  arch/arm64/include/asm/unistd.h | 2
-rw-r--r--  arch/arm64/include/asm/unistd32.h | 2
-rw-r--r--  arch/arm64/include/asm/virt.h | 2
-rw-r--r--  arch/arm64/include/asm/vmap_stack.h | 6
-rw-r--r--  arch/arm64/include/uapi/asm/hwcap.h | 1
-rw-r--r--  arch/arm64/include/uapi/asm/mman.h | 9
-rw-r--r--  arch/arm64/include/uapi/asm/ptrace.h | 9
-rw-r--r--  arch/arm64/kernel/Makefile | 1
-rw-r--r--  arch/arm64/kernel/acpi.c | 25
-rw-r--r--  arch/arm64/kernel/armv8_deprecated.c | 2
-rw-r--r--  arch/arm64/kernel/asm-offsets.c | 9
-rw-r--r--  arch/arm64/kernel/cpu-reset.S | 4
-rw-r--r--  arch/arm64/kernel/cpu_errata.c | 31
-rw-r--r--  arch/arm64/kernel/cpufeature.c | 455
-rw-r--r--  arch/arm64/kernel/cpuinfo.c | 9
-rw-r--r--  arch/arm64/kernel/crash_core.c | 4
-rw-r--r--  arch/arm64/kernel/debug-monitors.c | 4
-rw-r--r--  arch/arm64/kernel/efi-entry.S | 4
-rw-r--r--  arch/arm64/kernel/efi-header.S | 4
-rw-r--r--  arch/arm64/kernel/efi-rt-wrapper.S | 15
-rw-r--r--  arch/arm64/kernel/entry-common.c | 13
-rw-r--r--  arch/arm64/kernel/entry-fpsimd.S | 20
-rw-r--r--  arch/arm64/kernel/entry-ftrace.S | 5
-rw-r--r--  arch/arm64/kernel/entry.S | 69
-rw-r--r--  arch/arm64/kernel/fpsimd.c | 3
-rw-r--r--  arch/arm64/kernel/head.S | 49
-rw-r--r--  arch/arm64/kernel/hibernate-asm.S | 16
-rw-r--r--  arch/arm64/kernel/hibernate.c | 44
-rw-r--r--  arch/arm64/kernel/hyp-stub.S | 20
-rw-r--r--  arch/arm64/kernel/image-vars.h | 2
-rw-r--r--  arch/arm64/kernel/insn.c | 46
-rw-r--r--  arch/arm64/kernel/machine_kexec.c | 1
-rw-r--r--  arch/arm64/kernel/machine_kexec_file.c | 14
-rw-r--r--  arch/arm64/kernel/paravirt.c | 2
-rw-r--r--  arch/arm64/kernel/probes/decode-insn.c | 2
-rw-r--r--  arch/arm64/kernel/probes/kprobes_trampoline.S | 4
-rw-r--r--  arch/arm64/kernel/process.c | 41
-rw-r--r--  arch/arm64/kernel/ptrace.c | 9
-rw-r--r--  arch/arm64/kernel/reloc_test_syms.S | 44
-rw-r--r--  arch/arm64/kernel/relocate_kernel.S | 4
-rw-r--r--  arch/arm64/kernel/scs.c | 16
-rw-r--r--  arch/arm64/kernel/sdei.c | 42
-rw-r--r--  arch/arm64/kernel/signal.c | 16
-rw-r--r--  arch/arm64/kernel/sleep.S | 13
-rw-r--r--  arch/arm64/kernel/smccc-call.S | 8
-rw-r--r--  arch/arm64/kernel/smp.c | 14
-rw-r--r--  arch/arm64/kernel/syscall.c | 18
-rw-r--r--  arch/arm64/kernel/traps.c | 141
-rw-r--r--  arch/arm64/kernel/vdso.c | 155
-rw-r--r--  arch/arm64/kernel/vdso/Makefile | 12
-rw-r--r--  arch/arm64/kernel/vdso/note.S | 3
-rw-r--r--  arch/arm64/kernel/vdso/sigreturn.S | 54
-rw-r--r--  arch/arm64/kernel/vdso/vdso.S | 3
-rw-r--r--  arch/arm64/kernel/vdso32/sigreturn.S | 19
-rw-r--r--  arch/arm64/kernel/vmlinux.lds.S | 18
-rw-r--r--  arch/arm64/kvm/Kconfig | 22
-rw-r--r--  arch/arm64/kvm/Makefile | 46
-rw-r--r--  arch/arm64/kvm/aarch32.c | 204
-rw-r--r--  arch/arm64/kvm/arch_timer.c | 1171
-rw-r--r--  arch/arm64/kvm/arm.c | 1710
-rw-r--r--  arch/arm64/kvm/guest.c | 36
-rw-r--r--  arch/arm64/kvm/handle_exit.c | 2
-rw-r--r--  arch/arm64/kvm/hyp/Makefile | 16
-rw-r--r--  arch/arm64/kvm/hyp/aarch32.c | 140
-rw-r--r--  arch/arm64/kvm/hyp/entry.S | 23
-rw-r--r--  arch/arm64/kvm/hyp/hyp-entry.S | 1
-rw-r--r--  arch/arm64/kvm/hyp/switch.c | 14
-rw-r--r--  arch/arm64/kvm/hyp/sysreg-sr.c | 23
-rw-r--r--  arch/arm64/kvm/hyp/timer-sr.c | 48
-rw-r--r--  arch/arm64/kvm/hyp/tlb.c | 11
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v3-sr.c | 1113
-rw-r--r--  arch/arm64/kvm/hypercalls.c | 71
-rw-r--r--  arch/arm64/kvm/inject_fault.c | 75
-rw-r--r--  arch/arm64/kvm/mmio.c | 200
-rw-r--r--  arch/arm64/kvm/mmu.c | 2612
-rw-r--r--  arch/arm64/kvm/perf.c | 57
-rw-r--r--  arch/arm64/kvm/pmu-emul.c | 869
-rw-r--r--  arch/arm64/kvm/psci.c | 564
-rw-r--r--  arch/arm64/kvm/pvtime.c | 131
-rw-r--r--  arch/arm64/kvm/reset.c | 92
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 218
-rw-r--r--  arch/arm64/kvm/trace.h | 216
-rw-r--r--  arch/arm64/kvm/trace_arm.h | 378
-rw-r--r--  arch/arm64/kvm/trace_handle_exit.h | 215
-rw-r--r--  arch/arm64/kvm/vgic-sys-reg-v3.c | 2
-rw-r--r--  arch/arm64/kvm/vgic/trace.h | 38
-rw-r--r--  arch/arm64/kvm/vgic/vgic-debug.c | 300
-rw-r--r--  arch/arm64/kvm/vgic/vgic-init.c | 556
-rw-r--r--  arch/arm64/kvm/vgic/vgic-irqfd.c | 141
-rw-r--r--  arch/arm64/kvm/vgic/vgic-its.c | 2783
-rw-r--r--  arch/arm64/kvm/vgic/vgic-kvm-device.c | 741
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio-v2.c | 550
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio-v3.c | 1063
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio.c | 1088
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio.h | 227
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v2.c | 504
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3.c | 693
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v4.c | 453
-rw-r--r--  arch/arm64/kvm/vgic/vgic.c | 1020
-rw-r--r--  arch/arm64/kvm/vgic/vgic.h | 321
-rw-r--r--  arch/arm64/lib/copy_from_user.S | 32
-rw-r--r--  arch/arm64/lib/copy_in_user.S | 32
-rw-r--r--  arch/arm64/lib/copy_to_user.S | 32
-rw-r--r--  arch/arm64/lib/crc32.S | 2
-rw-r--r--  arch/arm64/lib/memcpy.S | 32
-rw-r--r--  arch/arm64/mm/context.c | 8
-rw-r--r--  arch/arm64/mm/dump.c | 7
-rw-r--r--  arch/arm64/mm/fault.c | 21
-rw-r--r--  arch/arm64/mm/hugetlbpage.c | 47
-rw-r--r--  arch/arm64/mm/init.c | 58
-rw-r--r--  arch/arm64/mm/kasan_init.c | 26
-rw-r--r--  arch/arm64/mm/mmu.c | 76
-rw-r--r--  arch/arm64/mm/numa.c | 9
-rw-r--r--  arch/arm64/mm/pageattr.c | 11
-rw-r--r--  arch/arm64/mm/proc.S | 60
-rw-r--r--  arch/arm64/net/bpf_jit.h | 30
-rw-r--r--  arch/arm64/net/bpf_jit_comp.c | 85
204 files changed, 22650 insertions, 1678 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 40fb05d96c60..7f9d38444d6d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -9,7 +9,10 @@ config ARM64
select ACPI_MCFG if (ACPI && PCI)
select ACPI_SPCR_TABLE if ACPI
select ACPI_PPTT if ACPI
+ select ARCH_HAS_DEBUG_WX
+ select ARCH_BINFMT_ELF_STATE
select ARCH_HAS_DEBUG_VIRTUAL
+ select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
@@ -20,6 +23,7 @@ config ARM64
select ARCH_HAS_KCOV
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE
+ select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_DEVMAP
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SETUP_DMA_OPS
@@ -32,6 +36,7 @@ config ARM64
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
+ select ARCH_HAVE_ELF_PROT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_INLINE_READ_LOCK if !PREEMPTION
select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
@@ -61,9 +66,12 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
select ARCH_USE_CMPXCHG_LOCKREF
+ select ARCH_USE_GNU_PROPERTY
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_USE_SYM_ANNOTATIONS
select ARCH_SUPPORTS_MEMORY_FAILURE
+ select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG)
select ARCH_SUPPORTS_NUMA_BALANCING
@@ -156,7 +164,6 @@ config ARM64
select HAVE_GCC_PLUGINS
select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_IRQ_TIME_ACCOUNTING
- select HAVE_MEMBLOCK_NODE_MAP if NUMA
select HAVE_NMI
select HAVE_PATA_PLATFORM
select HAVE_PERF_EVENTS
@@ -524,13 +531,13 @@ config ARM64_ERRATUM_1418040
If unsure, say Y.
-config ARM64_WORKAROUND_SPECULATIVE_AT_VHE
+config ARM64_WORKAROUND_SPECULATIVE_AT
bool
config ARM64_ERRATUM_1165522
- bool "Cortex-A76: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation"
+ bool "Cortex-A76: 1165522: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation"
default y
- select ARM64_WORKAROUND_SPECULATIVE_AT_VHE
+ select ARM64_WORKAROUND_SPECULATIVE_AT
help
This option adds a workaround for ARM Cortex-A76 erratum 1165522.
@@ -540,10 +547,23 @@ config ARM64_ERRATUM_1165522
If unsure, say Y.
+config ARM64_ERRATUM_1319367
+ bool "Cortex-A57/A72: 1319537: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation"
+ default y
+ select ARM64_WORKAROUND_SPECULATIVE_AT
+ help
+ This option adds workarounds for ARM Cortex-A57 erratum 1319537
+ and A72 erratum 1319367.
+
+ Cortex-A57 and A72 cores could end up with corrupted TLBs by
+ speculating an AT instruction during a guest context switch.
+
+ If unsure, say Y.
+
config ARM64_ERRATUM_1530923
- bool "Cortex-A55: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation"
+ bool "Cortex-A55: 1530923: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation"
default y
- select ARM64_WORKAROUND_SPECULATIVE_AT_VHE
+ select ARM64_WORKAROUND_SPECULATIVE_AT
help
This option adds a workaround for ARM Cortex-A55 erratum 1530923.
@@ -553,6 +573,9 @@ config ARM64_ERRATUM_1530923
If unsure, say Y.
+config ARM64_WORKAROUND_REPEAT_TLBI
+ bool
+
config ARM64_ERRATUM_1286807
bool "Cortex-A76: Modification of the translation table for a virtual address might lead to read-after-read ordering violation"
default y
@@ -569,22 +592,6 @@ config ARM64_ERRATUM_1286807
invalidated has been observed by other observers. The
workaround repeats the TLBI+DSB operation.
-config ARM64_WORKAROUND_SPECULATIVE_AT_NVHE
- bool
-
-config ARM64_ERRATUM_1319367
- bool "Cortex-A57/A72: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation"
- default y
- select ARM64_WORKAROUND_SPECULATIVE_AT_NVHE
- help
- This option adds work arounds for ARM Cortex-A57 erratum 1319537
- and A72 erratum 1319367
-
- Cortex-A57 and A72 cores could end-up with corrupted TLBs by
- speculating an AT instruction during a guest context switch.
-
- If unsure, say Y.
-
config ARM64_ERRATUM_1463225
bool "Cortex-A76: Software Step might prevent interrupt recognition"
default y
@@ -694,6 +701,35 @@ config CAVIUM_TX2_ERRATUM_219
If unsure, say Y.
+config FUJITSU_ERRATUM_010001
+ bool "Fujitsu-A64FX erratum E#010001: Undefined fault may occur wrongly"
+ default y
+ help
+ This option adds a workaround for Fujitsu-A64FX erratum E#010001.
+ On some variants of the Fujitsu-A64FX cores ver(1.0, 1.1), memory
+ accesses may cause undefined fault (Data abort, DFSC=0b111111).
+ This fault occurs under a specific hardware condition when a
+ load/store instruction performs an address translation using:
+ case-1 TTBR0_EL1 with TCR_EL1.NFD0 == 1.
+ case-2 TTBR0_EL2 with TCR_EL2.NFD0 == 1.
+ case-3 TTBR1_EL1 with TCR_EL1.NFD1 == 1.
+ case-4 TTBR1_EL2 with TCR_EL2.NFD1 == 1.
+
+ The workaround is to ensure these bits are clear in TCR_ELx.
+ The workaround only affects the Fujitsu-A64FX.
+
+ If unsure, say Y.
+
+config HISILICON_ERRATUM_161600802
+ bool "Hip07 161600802: Erroneous redistributor VLPI base"
+ default y
+ help
+ The HiSilicon Hip07 SoC uses the wrong redistributor base
+ when it is issued ITS commands such as VMOVP and VMAPP, and requires
+ a 128kB offset to be applied to the target address in these commands.
+
+ If unsure, say Y.
+
config QCOM_FALKOR_ERRATUM_1003
bool "Falkor E1003: Incorrect translation due to ASID change"
default y
@@ -705,9 +741,6 @@ config QCOM_FALKOR_ERRATUM_1003
is unchanged. Work around the erratum by invalidating the walk cache
entries for the trampoline before entering the kernel proper.
-config ARM64_WORKAROUND_REPEAT_TLBI
- bool
-
config QCOM_FALKOR_ERRATUM_1009
bool "Falkor E1009: Prematurely complete a DSB after a TLBI"
default y
@@ -729,25 +762,6 @@ config QCOM_QDF2400_ERRATUM_0065
If unsure, say Y.
-config SOCIONEXT_SYNQUACER_PREITS
- bool "Socionext Synquacer: Workaround for GICv3 pre-ITS"
- default y
- help
- Socionext Synquacer SoCs implement a separate h/w block to generate
- MSI doorbell writes with non-zero values for the device ID.
-
- If unsure, say Y.
-
-config HISILICON_ERRATUM_161600802
- bool "Hip07 161600802: Erroneous redistributor VLPI base"
- default y
- help
- The HiSilicon Hip07 SoC uses the wrong redistributor base
- when issued ITS commands such as VMOVP and VMAPP, and requires
- a 128kB offset to be applied to the target address in this commands.
-
- If unsure, say Y.
-
config QCOM_FALKOR_ERRATUM_E1041
bool "Falkor E1041: Speculative instruction fetches might cause errant memory access"
default y
@@ -758,22 +772,12 @@ config QCOM_FALKOR_ERRATUM_E1041
If unsure, say Y.
-config FUJITSU_ERRATUM_010001
- bool "Fujitsu-A64FX erratum E#010001: Undefined fault may occur wrongly"
+config SOCIONEXT_SYNQUACER_PREITS
+ bool "Socionext Synquacer: Workaround for GICv3 pre-ITS"
default y
help
- This option adds a workaround for Fujitsu-A64FX erratum E#010001.
- On some variants of the Fujitsu-A64FX cores ver(1.0, 1.1), memory
- accesses may cause undefined fault (Data abort, DFSC=0b111111).
- This fault occurs under a specific hardware condition when a
- load/store instruction performs an address translation using:
- case-1 TTBR0_EL1 with TCR_EL1.NFD0 == 1.
- case-2 TTBR0_EL2 with TCR_EL2.NFD0 == 1.
- case-3 TTBR1_EL1 with TCR_EL1.NFD1 == 1.
- case-4 TTBR1_EL2 with TCR_EL2.NFD1 == 1.
-
- The workaround is to ensure these bits are clear in TCR_ELx.
- The workaround only affects the Fujitsu-A64FX.
+ Socionext Synquacer SoCs implement a separate h/w block to generate
+ MSI doorbell writes with non-zero values for the device ID.
If unsure, say Y.
@@ -1025,6 +1029,10 @@ config ARCH_HAS_CACHE_LINE_SIZE
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
def_bool y if PGTABLE_LEVELS > 2
+# Supported by clang >= 7.0
+config CC_HAVE_SHADOW_CALL_STACK
+ def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
+
config SECCOMP
bool "Enable seccomp to safely compute untrusted bytecode"
---help---
@@ -1584,6 +1592,48 @@ endmenu
menu "ARMv8.5 architectural features"
+config ARM64_BTI
+ bool "Branch Target Identification support"
+ default y
+ help
+ Branch Target Identification (part of the ARMv8.5 Extensions)
+ provides a mechanism to limit the set of locations to which computed
+ branch instructions such as BR or BLR can jump.
+
+ To make use of BTI on CPUs that support it, say Y.
+
+ BTI is intended to provide complementary protection to other control
+ flow integrity protection mechanisms, such as the Pointer
+ authentication mechanism provided as part of the ARMv8.3 Extensions.
+ For this reason, it does not make sense to enable this option without
+ also enabling support for pointer authentication. Thus, when
+ enabling this option you should also select ARM64_PTR_AUTH=y.
+
+ Userspace binaries must also be specifically compiled to make use of
+ this mechanism. If you say N here or the hardware does not support
+ BTI, such binaries can still run, but you get no additional
+ enforcement of branch destinations.
+
+config ARM64_BTI_KERNEL
+ bool "Use Branch Target Identification for kernel"
+ default y
+ depends on ARM64_BTI
+ depends on ARM64_PTR_AUTH
+ depends on CC_HAS_BRANCH_PROT_PAC_RET_BTI
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94697
+ depends on !CC_IS_GCC || GCC_VERSION >= 100100
+ depends on !(CC_IS_CLANG && GCOV_KERNEL)
+ depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
+ help
+ Build the kernel with Branch Target Identification annotations
+ and enable enforcement of this for kernel code. When this option
+ is enabled and the system supports BTI all kernel code including
+ modular code must have BTI enabled.
+
+config CC_HAS_BRANCH_PROT_PAC_RET_BTI
+ # GCC 9 or later, clang 8 or later
+ def_bool $(cc-option,-mbranch-protection=pac-ret+leaf+bti)
+
config ARM64_E0PD
bool "Enable support for E0PD"
default y
@@ -1785,7 +1835,7 @@ config EFI
select EFI_PARAMS_FROM_FDT
select EFI_RUNTIME_WRAPPERS
select EFI_STUB
- select EFI_ARMSTUB
+ select EFI_GENERIC_STUB
default y
help
This option provides support for runtime services provided
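
As a rough sketch of what the ARM64_BTI / ARM64_BTI_KERNEL options above ask the compiler to do (illustration only, not part of the patch; it assumes an arm64 toolchain that accepts -mbranch-protection=pac-ret+leaf+bti):

/*
 * Illustration only (not from the patch): built with something like
 *   gcc -O2 -mbranch-protection=pac-ret+leaf+bti bti-demo.c
 * the indirect-branch target below is expected to begin with a
 * BTI-compatible landing pad ("bti c", or a "paciasp", which also
 * counts as one), and non-leaf functions to sign/authenticate their
 * return address with paciasp/autiasp.  On CPUs without BTI/PAuth
 * these encodings execute as NOPs, which is why such binaries still
 * run, just without the extra enforcement the help text mentions.
 */
#include <stdio.h>

static void target(void)
{
	printf("reached via an indirect branch\n");
}

int main(void)
{
	void (*fp)(void) = target;	/* computed branch: BLR to target() */

	fp();
	return 0;
}

Disassembling the result (e.g. with objdump -d) is one way to see the landing pads; enforcement on real BTI hardware additionally depends on the ELF GNU property note and PROT_BTI mappings that other parts of this series wire up.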
diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug
index a1efa246c9ed..cdf7ec0b975e 100644
--- a/arch/arm64/Kconfig.debug
+++ b/arch/arm64/Kconfig.debug
@@ -23,35 +23,6 @@ config ARM64_RANDOMIZE_TEXT_OFFSET
of TEXT_OFFSET and platforms must not require a specific
value.
-config DEBUG_WX
- bool "Warn on W+X mappings at boot"
- select PTDUMP_CORE
- ---help---
- Generate a warning if any W+X mappings are found at boot.
-
- This is useful for discovering cases where the kernel is leaving
- W+X mappings after applying NX, as such mappings are a security risk.
- This check also includes UXN, which should be set on all kernel
- mappings.
-
- Look for a message in dmesg output like this:
-
- arm64/mm: Checked W+X mappings: passed, no W+X pages found.
-
- or like this, if the check failed:
-
- arm64/mm: Checked W+X mappings: FAILED, <N> W+X pages found.
-
- Note that even if the check fails, your kernel is possibly
- still fine, as W+X mappings are not a security hole in
- themselves, what they do is that they make the exploitation
- of other unfixed kernel bugs easier.
-
- There is no runtime or memory usage effect of this option
- once the kernel has booted up - it's a one time check.
-
- If in doubt, say "Y".
-
config DEBUG_EFI
depends on EFI && DEBUG_INFO
bool "UEFI debugging"
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 3c7e310fd8bf..44537dcfd251 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -274,12 +274,9 @@ config ARCH_UNIPHIER
config ARCH_VEXPRESS
bool "ARMv8 software model (Versatile Express)"
- select COMMON_CLK_VERSATILE
select GPIOLIB
select PM
select PM_GENERIC_DOMAINS
- select POWER_RESET_VEXPRESS
- select VEXPRESS_CONFIG
help
This enables support for the ARMv8 software model (Versatile
Express).
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 85e4149cc5d5..650e1185c190 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -12,7 +12,6 @@
LDFLAGS_vmlinux :=--no-undefined -X
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
-GZFLAGS :=-9
ifeq ($(CONFIG_RELOCATABLE), y)
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
@@ -71,7 +70,14 @@ branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
ifeq ($(CONFIG_ARM64_PTR_AUTH),y)
branch-prot-flags-$(CONFIG_CC_HAS_SIGN_RETURN_ADDRESS) := -msign-return-address=all
+# We enable additional protection for leaf functions as there is some
+# narrow potential for ROP protection benefits and no substantial
+# performance impact has been observed.
+ifeq ($(CONFIG_ARM64_BTI_KERNEL),y)
+branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET_BTI) := -mbranch-protection=pac-ret+leaf+bti
+else
branch-prot-flags-$(CONFIG_CC_HAS_BRANCH_PROT_PAC_RET) := -mbranch-protection=pac-ret+leaf
+endif
# -march=armv8.3-a enables the non-nops instructions for PAC, to avoid the
# compiler to generate them and consequently to break the single image contract
# we pass it only to the assembler. This option is utilized only in case of non
@@ -81,6 +87,10 @@ endif
KBUILD_CFLAGS += $(branch-prot-flags-y)
+ifeq ($(CONFIG_SHADOW_CALL_STACK), y)
+KBUILD_CFLAGS += -ffixed-x18
+endif
+
ifeq ($(CONFIG_CPU_BIG_ENDIAN), y)
KBUILD_CPPFLAGS += -mbig-endian
CHECKFLAGS += -D__AARCH64EB__
@@ -118,7 +128,7 @@ TEXT_OFFSET := $(shell awk "BEGIN {srand(); printf \"0x%06x\n\", \
int(2 * 1024 * 1024 / (2 ^ $(CONFIG_ARM64_PAGE_SHIFT)) * \
rand()) * (2 ^ $(CONFIG_ARM64_PAGE_SHIFT))}")
else
-TEXT_OFFSET := 0x00080000
+TEXT_OFFSET := 0x0
endif
ifeq ($(CONFIG_KASAN_SW_TAGS), y)
@@ -131,7 +141,7 @@ KBUILD_CFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT)
KBUILD_CPPFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT)
KBUILD_AFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT)
-export TEXT_OFFSET GZFLAGS
+export TEXT_OFFSET
core-y += arch/arm64/
libs-y := arch/arm64/lib/ $(libs-y)
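
A rough sketch (illustration only, not part of the patch) of what the -ffixed-x18 addition above and the CC_HAVE_SHADOW_CALL_STACK probe mean for generated code, assuming a clang arm64 toolchain:

/*
 * Illustration only (not from the patch): compiled with something like
 *   clang --target=aarch64-linux-gnu -O2 -S \
 *         -fsanitize=shadow-call-stack -ffixed-x18 scs-demo.c
 * the non-leaf function main() is expected to spill its return address
 * to the shadow call stack in its prologue, roughly
 *   str x30, [x18], #8      // push LR onto the x18-based shadow stack
 * and reload it before returning, so overwriting the ordinary stack
 * frame cannot redirect the return.  -ffixed-x18 keeps x18 out of
 * register allocation for exactly this purpose; the kernel side of the
 * series (arch/arm64/kernel/scs.c, asm/scs.h) provides the per-task
 * shadow stacks that x18 points at.  Running this as a userspace
 * binary would need a runtime that initialises x18, so the sketch is
 * meant for inspecting the generated assembly, not for executing.
 */
#include <stdio.h>

static void __attribute__((noinline)) bump(int *v)
{
	*v += 1;
}

int main(void)
{
	int v = 41;

	bump(&v);			/* makes main() a non-leaf function */
	printf("v = %d\n", v);
	return 0;
}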
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
index 316e8a443913..dc4ab6b434f9 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pinetab.dts
@@ -98,7 +98,7 @@
};
&codec_analog {
- hpvcc-supply = <&reg_eldo1>;
+ cpvdd-supply = <&reg_eldo1>;
status = "okay";
};
diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
index 31143fe64d91..c26cc1fcaffd 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
@@ -154,24 +154,6 @@
};
};
- sound_spdif {
- compatible = "simple-audio-card";
- simple-audio-card,name = "On-board SPDIF";
-
- simple-audio-card,cpu {
- sound-dai = <&spdif>;
- };
-
- simple-audio-card,codec {
- sound-dai = <&spdif_out>;
- };
- };
-
- spdif_out: spdif-out {
- #sound-dai-cells = <0>;
- compatible = "linux,spdif-dit";
- };
-
timer {
compatible = "arm,armv8-timer";
allwinner,erratum-unknown1;
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
index 0882ea215b88..c0aef7d69117 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
@@ -2319,7 +2319,7 @@
reg = <0x0 0xff400000 0x0 0x40000>;
interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clkc CLKID_USB1_DDR_BRIDGE>;
- clock-names = "ddr";
+ clock-names = "otg";
phys = <&usb2_phy1>;
phy-names = "usb2-phy";
dr_mode = "peripheral";
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12.dtsi
index 783e5a397f86..55d39020ec72 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12.dtsi
@@ -1,4 +1,3 @@
-
// SPDX-License-Identifier: (GPL-2.0+ OR MIT)
/*
* Copyright (c) 2019 BayLibre, SAS
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi
index c33e85fbdaba..c6c8caed8327 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12b-khadas-vim3.dtsi
@@ -154,6 +154,10 @@
clock-latency = <50000>;
};
+&frddr_a {
+ status = "okay";
+};
+
&frddr_b {
status = "okay";
};
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts b/arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts
index 325e448eb09c..06c5430eb92d 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-g12b-ugoos-am6.dts
@@ -545,7 +545,7 @@
&usb {
status = "okay";
dr_mode = "host";
- vbus-regulator = <&usb_pwr_en>;
+ vbus-supply = <&usb_pwr_en>;
};
&usb2_phy0 {
diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
index 2a7f70b71149..13d0570c7ed6 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
@@ -447,7 +447,7 @@
edma0: dma-controller@22c0000 {
#dma-cells = <2>;
- compatible = "fsl,ls1028a-edma";
+ compatible = "fsl,ls1028a-edma", "fsl,vf610-edma";
reg = <0x0 0x22c0000 0x0 0x10000>,
<0x0 0x22d0000 0x0 0x10000>,
<0x0 0x22e0000 0x0 0x10000>;
diff --git a/arch/arm64/boot/dts/freescale/imx8mm.dtsi b/arch/arm64/boot/dts/freescale/imx8mm.dtsi
index cc7152ecedd9..8829628f757a 100644
--- a/arch/arm64/boot/dts/freescale/imx8mm.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mm.dtsi
@@ -264,7 +264,7 @@
aips1: bus@30000000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x301f0000 0x10000>;
+ reg = <0x30000000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges = <0x30000000 0x30000000 0x400000>;
@@ -543,7 +543,7 @@
aips2: bus@30400000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x305f0000 0x10000>;
+ reg = <0x30400000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges = <0x30400000 0x30400000 0x400000>;
@@ -603,7 +603,7 @@
aips3: bus@30800000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x309f0000 0x10000>;
+ reg = <0x30800000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges = <0x30800000 0x30800000 0x400000>,
@@ -863,7 +863,7 @@
aips4: bus@32c00000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x32df0000 0x10000>;
+ reg = <0x32c00000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges = <0x32c00000 0x32c00000 0x400000>;
diff --git a/arch/arm64/boot/dts/freescale/imx8mn.dtsi b/arch/arm64/boot/dts/freescale/imx8mn.dtsi
index fa78f0163270..43971abe218b 100644
--- a/arch/arm64/boot/dts/freescale/imx8mn.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mn.dtsi
@@ -241,7 +241,7 @@
aips1: bus@30000000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x301f0000 0x10000>;
+ reg = <0x30000000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
@@ -448,7 +448,7 @@
aips2: bus@30400000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x305f0000 0x10000>;
+ reg = <0x30400000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
@@ -508,7 +508,7 @@
aips3: bus@30800000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x309f0000 0x10000>;
+ reg = <0x30800000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
@@ -718,7 +718,7 @@
reg = <0x30bd0000 0x10000>;
interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clk IMX8MN_CLK_SDMA1_ROOT>,
- <&clk IMX8MN_CLK_SDMA1_ROOT>;
+ <&clk IMX8MN_CLK_AHB>;
clock-names = "ipg", "ahb";
#dma-cells = <3>;
fsl,sdma-ram-script-name = "imx/sdma/sdma-imx7d.bin";
@@ -754,7 +754,7 @@
aips4: bus@32c00000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x32df0000 0x10000>;
+ reg = <0x32c00000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
diff --git a/arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h b/arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h
index da78f89b6c98..319ab34cab3e 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h
+++ b/arch/arm64/boot/dts/freescale/imx8mp-pinfunc.h
@@ -151,26 +151,26 @@
#define MX8MP_IOMUXC_ENET_TXC__SIM_M_HADDR22 0x070 0x2D0 0x000 0x7 0x0
#define MX8MP_IOMUXC_ENET_RX_CTL__ENET_QOS_RGMII_RX_CTL 0x074 0x2D4 0x000 0x0 0x0
#define MX8MP_IOMUXC_ENET_RX_CTL__AUDIOMIX_SAI7_TX_SYNC 0x074 0x2D4 0x540 0x2 0x0
-#define MX8MP_IOMUXC_ENET_RX_CTL__AUDIOMIX_BIT_STREAM03 0x074 0x2D4 0x4CC 0x3 0x0
+#define MX8MP_IOMUXC_ENET_RX_CTL__AUDIOMIX_BIT_STREAM03 0x074 0x2D4 0x4CC 0x3 0x1
#define MX8MP_IOMUXC_ENET_RX_CTL__GPIO1_IO24 0x074 0x2D4 0x000 0x5 0x0
#define MX8MP_IOMUXC_ENET_RX_CTL__USDHC3_DATA2 0x074 0x2D4 0x618 0x6 0x0
#define MX8MP_IOMUXC_ENET_RX_CTL__SIM_M_HADDR23 0x074 0x2D4 0x000 0x7 0x0
#define MX8MP_IOMUXC_ENET_RXC__CCM_ENET_QOS_CLOCK_GENERATE_RX_CLK 0x078 0x2D8 0x000 0x0 0x0
#define MX8MP_IOMUXC_ENET_RXC__ENET_QOS_RX_ER 0x078 0x2D8 0x000 0x1 0x0
#define MX8MP_IOMUXC_ENET_RXC__AUDIOMIX_SAI7_TX_BCLK 0x078 0x2D8 0x53C 0x2 0x0
-#define MX8MP_IOMUXC_ENET_RXC__AUDIOMIX_BIT_STREAM02 0x078 0x2D8 0x4C8 0x3 0x0
+#define MX8MP_IOMUXC_ENET_RXC__AUDIOMIX_BIT_STREAM02 0x078 0x2D8 0x4C8 0x3 0x1
#define MX8MP_IOMUXC_ENET_RXC__GPIO1_IO25 0x078 0x2D8 0x000 0x5 0x0
#define MX8MP_IOMUXC_ENET_RXC__USDHC3_DATA3 0x078 0x2D8 0x61C 0x6 0x0
#define MX8MP_IOMUXC_ENET_RXC__SIM_M_HADDR24 0x078 0x2D8 0x000 0x7 0x0
#define MX8MP_IOMUXC_ENET_RD0__ENET_QOS_RGMII_RD0 0x07C 0x2DC 0x000 0x0 0x0
#define MX8MP_IOMUXC_ENET_RD0__AUDIOMIX_SAI7_RX_DATA00 0x07C 0x2DC 0x534 0x2 0x0
-#define MX8MP_IOMUXC_ENET_RD0__AUDIOMIX_BIT_STREAM01 0x07C 0x2DC 0x4C4 0x3 0x0
+#define MX8MP_IOMUXC_ENET_RD0__AUDIOMIX_BIT_STREAM01 0x07C 0x2DC 0x4C4 0x3 0x1
#define MX8MP_IOMUXC_ENET_RD0__GPIO1_IO26 0x07C 0x2DC 0x000 0x5 0x0
#define MX8MP_IOMUXC_ENET_RD0__USDHC3_DATA4 0x07C 0x2DC 0x620 0x6 0x0
#define MX8MP_IOMUXC_ENET_RD0__SIM_M_HADDR25 0x07C 0x2DC 0x000 0x7 0x0
#define MX8MP_IOMUXC_ENET_RD1__ENET_QOS_RGMII_RD1 0x080 0x2E0 0x000 0x0 0x0
#define MX8MP_IOMUXC_ENET_RD1__AUDIOMIX_SAI7_RX_SYNC 0x080 0x2E0 0x538 0x2 0x0
-#define MX8MP_IOMUXC_ENET_RD1__AUDIOMIX_BIT_STREAM00 0x080 0x2E0 0x4C0 0x3 0x0
+#define MX8MP_IOMUXC_ENET_RD1__AUDIOMIX_BIT_STREAM00 0x080 0x2E0 0x4C0 0x3 0x1
#define MX8MP_IOMUXC_ENET_RD1__GPIO1_IO27 0x080 0x2E0 0x000 0x5 0x0
#define MX8MP_IOMUXC_ENET_RD1__USDHC3_RESET_B 0x080 0x2E0 0x000 0x6 0x0
#define MX8MP_IOMUXC_ENET_RD1__SIM_M_HADDR26 0x080 0x2E0 0x000 0x7 0x0
@@ -291,7 +291,7 @@
#define MX8MP_IOMUXC_SD2_DATA0__I2C4_SDA 0x0C8 0x328 0x5C0 0x2 0x1
#define MX8MP_IOMUXC_SD2_DATA0__UART2_DCE_RX 0x0C8 0x328 0x5F0 0x3 0x2
#define MX8MP_IOMUXC_SD2_DATA0__UART2_DTE_TX 0x0C8 0x328 0x000 0x3 0x0
-#define MX8MP_IOMUXC_SD2_DATA0__AUDIOMIX_BIT_STREAM00 0x0C8 0x328 0x4C0 0x4 0x1
+#define MX8MP_IOMUXC_SD2_DATA0__AUDIOMIX_BIT_STREAM00 0x0C8 0x328 0x4C0 0x4 0x2
#define MX8MP_IOMUXC_SD2_DATA0__GPIO2_IO15 0x0C8 0x328 0x000 0x5 0x0
#define MX8MP_IOMUXC_SD2_DATA0__CCMSRCGPCMIX_OBSERVE2 0x0C8 0x328 0x000 0x6 0x0
#define MX8MP_IOMUXC_SD2_DATA0__OBSERVE_MUX_OUT02 0x0C8 0x328 0x000 0x7 0x0
@@ -313,7 +313,7 @@
#define MX8MP_IOMUXC_SD2_DATA3__USDHC2_DATA3 0x0D4 0x334 0x000 0x0 0x0
#define MX8MP_IOMUXC_SD2_DATA3__ECSPI2_MISO 0x0D4 0x334 0x56C 0x2 0x0
#define MX8MP_IOMUXC_SD2_DATA3__AUDIOMIX_SPDIF_IN 0x0D4 0x334 0x544 0x3 0x1
-#define MX8MP_IOMUXC_SD2_DATA3__AUDIOMIX_BIT_STREAM03 0x0D4 0x334 0x4CC 0x4 0x1
+#define MX8MP_IOMUXC_SD2_DATA3__AUDIOMIX_BIT_STREAM03 0x0D4 0x334 0x4CC 0x4 0x2
#define MX8MP_IOMUXC_SD2_DATA3__GPIO2_IO18 0x0D4 0x334 0x000 0x5 0x0
#define MX8MP_IOMUXC_SD2_DATA3__CCMSRCGPCMIX_EARLY_RESET 0x0D4 0x334 0x000 0x6 0x0
#define MX8MP_IOMUXC_SD2_RESET_B__USDHC2_RESET_B 0x0D8 0x338 0x000 0x0 0x0
@@ -487,27 +487,27 @@
#define MX8MP_IOMUXC_SAI5_RXD0__AUDIOMIX_SAI1_TX_DATA02 0x134 0x394 0x000 0x1 0x0
#define MX8MP_IOMUXC_SAI5_RXD0__PWM2_OUT 0x134 0x394 0x000 0x2 0x0
#define MX8MP_IOMUXC_SAI5_RXD0__I2C5_SCL 0x134 0x394 0x5C4 0x3 0x1
-#define MX8MP_IOMUXC_SAI5_RXD0__AUDIOMIX_BIT_STREAM00 0x134 0x394 0x4C0 0x4 0x2
+#define MX8MP_IOMUXC_SAI5_RXD0__AUDIOMIX_BIT_STREAM00 0x134 0x394 0x4C0 0x4 0x3
#define MX8MP_IOMUXC_SAI5_RXD0__GPIO3_IO21 0x134 0x394 0x000 0x5 0x0
#define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_SAI5_RX_DATA01 0x138 0x398 0x4FC 0x0 0x0
#define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_SAI1_TX_DATA03 0x138 0x398 0x000 0x1 0x0
#define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_SAI1_TX_SYNC 0x138 0x398 0x4D8 0x2 0x0
#define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_SAI5_TX_SYNC 0x138 0x398 0x510 0x3 0x0
-#define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_BIT_STREAM01 0x138 0x398 0x4C4 0x4 0x2
+#define MX8MP_IOMUXC_SAI5_RXD1__AUDIOMIX_BIT_STREAM01 0x138 0x398 0x4C4 0x4 0x3
#define MX8MP_IOMUXC_SAI5_RXD1__GPIO3_IO22 0x138 0x398 0x000 0x5 0x0
#define MX8MP_IOMUXC_SAI5_RXD1__CAN1_TX 0x138 0x398 0x000 0x6 0x0
#define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_SAI5_RX_DATA02 0x13C 0x39C 0x500 0x0 0x0
#define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_SAI1_TX_DATA04 0x13C 0x39C 0x000 0x1 0x0
#define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_SAI1_TX_SYNC 0x13C 0x39C 0x4D8 0x2 0x1
#define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_SAI5_TX_BCLK 0x13C 0x39C 0x50C 0x3 0x0
-#define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_BIT_STREAM02 0x13C 0x39C 0x4C8 0x4 0x2
+#define MX8MP_IOMUXC_SAI5_RXD2__AUDIOMIX_BIT_STREAM02 0x13C 0x39C 0x4C8 0x4 0x3
#define MX8MP_IOMUXC_SAI5_RXD2__GPIO3_IO23 0x13C 0x39C 0x000 0x5 0x0
#define MX8MP_IOMUXC_SAI5_RXD2__CAN1_RX 0x13C 0x39C 0x54C 0x6 0x0
#define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_SAI5_RX_DATA03 0x140 0x3A0 0x504 0x0 0x0
#define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_SAI1_TX_DATA05 0x140 0x3A0 0x000 0x1 0x0
#define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_SAI1_TX_SYNC 0x140 0x3A0 0x4D8 0x2 0x2
#define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_SAI5_TX_DATA00 0x140 0x3A0 0x000 0x3 0x0
-#define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_BIT_STREAM03 0x140 0x3A0 0x4CC 0x4 0x2
+#define MX8MP_IOMUXC_SAI5_RXD3__AUDIOMIX_BIT_STREAM03 0x140 0x3A0 0x4CC 0x4 0x3
#define MX8MP_IOMUXC_SAI5_RXD3__GPIO3_IO24 0x140 0x3A0 0x000 0x5 0x0
#define MX8MP_IOMUXC_SAI5_RXD3__CAN2_TX 0x140 0x3A0 0x000 0x6 0x0
#define MX8MP_IOMUXC_SAI5_MCLK__AUDIOMIX_SAI5_MCLK 0x144 0x3A4 0x4F0 0x0 0x0
@@ -528,22 +528,22 @@
#define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_SAI1_RX_DATA00 0x150 0x3B0 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_SAI5_RX_DATA00 0x150 0x3B0 0x4F8 0x1 0x1
#define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_SAI1_TX_DATA01 0x150 0x3B0 0x000 0x2 0x0
-#define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_BIT_STREAM00 0x150 0x3B0 0x4C0 0x3 0x3
+#define MX8MP_IOMUXC_SAI1_RXD0__AUDIOMIX_BIT_STREAM00 0x150 0x3B0 0x4C0 0x3 0x4
#define MX8MP_IOMUXC_SAI1_RXD0__ENET1_1588_EVENT1_IN 0x150 0x3B0 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI1_RXD0__GPIO4_IO02 0x150 0x3B0 0x000 0x5 0x0
#define MX8MP_IOMUXC_SAI1_RXD1__AUDIOMIX_SAI1_RX_DATA01 0x154 0x3B4 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI1_RXD1__AUDIOMIX_SAI5_RX_DATA01 0x154 0x3B4 0x4FC 0x1 0x1
-#define MX8MP_IOMUXC_SAI1_RXD1__AUDIOMIX_BIT_STREAM01 0x154 0x3B4 0x4C4 0x3 0x3
+#define MX8MP_IOMUXC_SAI1_RXD1__AUDIOMIX_BIT_STREAM01 0x154 0x3B4 0x4C4 0x3 0x4
#define MX8MP_IOMUXC_SAI1_RXD1__ENET1_1588_EVENT1_OUT 0x154 0x3B4 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI1_RXD1__GPIO4_IO03 0x154 0x3B4 0x000 0x5 0x0
#define MX8MP_IOMUXC_SAI1_RXD2__AUDIOMIX_SAI1_RX_DATA02 0x158 0x3B8 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI1_RXD2__AUDIOMIX_SAI5_RX_DATA02 0x158 0x3B8 0x500 0x1 0x1
-#define MX8MP_IOMUXC_SAI1_RXD2__AUDIOMIX_BIT_STREAM02 0x158 0x3B8 0x4C8 0x3 0x3
+#define MX8MP_IOMUXC_SAI1_RXD2__AUDIOMIX_BIT_STREAM02 0x158 0x3B8 0x4C8 0x3 0x4
#define MX8MP_IOMUXC_SAI1_RXD2__ENET1_MDC 0x158 0x3B8 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI1_RXD2__GPIO4_IO04 0x158 0x3B8 0x000 0x5 0x0
#define MX8MP_IOMUXC_SAI1_RXD3__AUDIOMIX_SAI1_RX_DATA03 0x15C 0x3BC 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI1_RXD3__AUDIOMIX_SAI5_RX_DATA03 0x15C 0x3BC 0x504 0x1 0x1
-#define MX8MP_IOMUXC_SAI1_RXD3__AUDIOMIX_BIT_STREAM03 0x15C 0x3BC 0x4CC 0x3 0x3
+#define MX8MP_IOMUXC_SAI1_RXD3__AUDIOMIX_BIT_STREAM03 0x15C 0x3BC 0x4CC 0x3 0x4
#define MX8MP_IOMUXC_SAI1_RXD3__ENET1_MDIO 0x15C 0x3BC 0x57C 0x4 0x1
#define MX8MP_IOMUXC_SAI1_RXD3__GPIO4_IO05 0x15C 0x3BC 0x000 0x5 0x0
#define MX8MP_IOMUXC_SAI1_RXD4__AUDIOMIX_SAI1_RX_DATA04 0x160 0x3C0 0x000 0x0 0x0
@@ -624,7 +624,7 @@
#define MX8MP_IOMUXC_SAI2_RXFS__UART1_DCE_TX 0x19C 0x3FC 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI2_RXFS__UART1_DTE_RX 0x19C 0x3FC 0x5E8 0x4 0x2
#define MX8MP_IOMUXC_SAI2_RXFS__GPIO4_IO21 0x19C 0x3FC 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI2_RXFS__AUDIOMIX_BIT_STREAM02 0x19C 0x3FC 0x4C8 0x6 0x4
+#define MX8MP_IOMUXC_SAI2_RXFS__AUDIOMIX_BIT_STREAM02 0x19C 0x3FC 0x4C8 0x6 0x5
#define MX8MP_IOMUXC_SAI2_RXFS__SIM_M_HSIZE00 0x19C 0x3FC 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI2_RXC__AUDIOMIX_SAI2_RX_BCLK 0x1A0 0x400 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI2_RXC__AUDIOMIX_SAI5_TX_BCLK 0x1A0 0x400 0x50C 0x1 0x2
@@ -632,7 +632,7 @@
#define MX8MP_IOMUXC_SAI2_RXC__UART1_DCE_RX 0x1A0 0x400 0x5E8 0x4 0x3
#define MX8MP_IOMUXC_SAI2_RXC__UART1_DTE_TX 0x1A0 0x400 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI2_RXC__GPIO4_IO22 0x1A0 0x400 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI2_RXC__AUDIOMIX_BIT_STREAM01 0x1A0 0x400 0x4C4 0x6 0x4
+#define MX8MP_IOMUXC_SAI2_RXC__AUDIOMIX_BIT_STREAM01 0x1A0 0x400 0x4C4 0x6 0x5
#define MX8MP_IOMUXC_SAI2_RXC__SIM_M_HSIZE01 0x1A0 0x400 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI2_RXD0__AUDIOMIX_SAI2_RX_DATA00 0x1A4 0x404 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI2_RXD0__AUDIOMIX_SAI5_TX_DATA00 0x1A4 0x404 0x000 0x1 0x0
@@ -641,7 +641,7 @@
#define MX8MP_IOMUXC_SAI2_RXD0__UART1_DCE_RTS 0x1A4 0x404 0x5E4 0x4 0x2
#define MX8MP_IOMUXC_SAI2_RXD0__UART1_DTE_CTS 0x1A4 0x404 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI2_RXD0__GPIO4_IO23 0x1A4 0x404 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI2_RXD0__AUDIOMIX_BIT_STREAM03 0x1A4 0x404 0x4CC 0x6 0x4
+#define MX8MP_IOMUXC_SAI2_RXD0__AUDIOMIX_BIT_STREAM03 0x1A4 0x404 0x4CC 0x6 0x5
#define MX8MP_IOMUXC_SAI2_RXD0__SIM_M_HSIZE02 0x1A4 0x404 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_SAI2_TX_SYNC 0x1A8 0x408 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_SAI5_TX_DATA01 0x1A8 0x408 0x000 0x1 0x0
@@ -650,13 +650,13 @@
#define MX8MP_IOMUXC_SAI2_TXFS__UART1_DCE_CTS 0x1A8 0x408 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI2_TXFS__UART1_DTE_RTS 0x1A8 0x408 0x5E4 0x4 0x3
#define MX8MP_IOMUXC_SAI2_TXFS__GPIO4_IO24 0x1A8 0x408 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_BIT_STREAM02 0x1A8 0x408 0x4C8 0x6 0x5
+#define MX8MP_IOMUXC_SAI2_TXFS__AUDIOMIX_BIT_STREAM02 0x1A8 0x408 0x4C8 0x6 0x6
#define MX8MP_IOMUXC_SAI2_TXFS__SIM_M_HWRITE 0x1A8 0x408 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_SAI2_TX_BCLK 0x1AC 0x40C 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_SAI5_TX_DATA02 0x1AC 0x40C 0x000 0x1 0x0
#define MX8MP_IOMUXC_SAI2_TXC__CAN1_RX 0x1AC 0x40C 0x54C 0x3 0x1
#define MX8MP_IOMUXC_SAI2_TXC__GPIO4_IO25 0x1AC 0x40C 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_BIT_STREAM01 0x1AC 0x40C 0x4C4 0x6 0x5
+#define MX8MP_IOMUXC_SAI2_TXC__AUDIOMIX_BIT_STREAM01 0x1AC 0x40C 0x4C4 0x6 0x6
#define MX8MP_IOMUXC_SAI2_TXC__SIM_M_HREADYOUT 0x1AC 0x40C 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI2_TXD0__AUDIOMIX_SAI2_TX_DATA00 0x1B0 0x410 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI2_TXD0__AUDIOMIX_SAI5_TX_DATA03 0x1B0 0x410 0x000 0x1 0x0
@@ -680,7 +680,7 @@
#define MX8MP_IOMUXC_SAI3_RXFS__AUDIOMIX_SAI3_RX_DATA01 0x1B8 0x418 0x000 0x3 0x0
#define MX8MP_IOMUXC_SAI3_RXFS__AUDIOMIX_SPDIF_IN 0x1B8 0x418 0x544 0x4 0x2
#define MX8MP_IOMUXC_SAI3_RXFS__GPIO4_IO28 0x1B8 0x418 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI3_RXFS__AUDIOMIX_BIT_STREAM00 0x1B8 0x418 0x4C0 0x6 0x4
+#define MX8MP_IOMUXC_SAI3_RXFS__AUDIOMIX_BIT_STREAM00 0x1B8 0x418 0x4C0 0x6 0x5
#define MX8MP_IOMUXC_SAI3_RXFS__TPSMP_HTRANS00 0x1B8 0x418 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI3_RXC__AUDIOMIX_SAI3_RX_BCLK 0x1BC 0x41C 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI3_RXC__AUDIOMIX_SAI2_RX_DATA02 0x1BC 0x41C 0x000 0x1 0x0
@@ -697,7 +697,7 @@
#define MX8MP_IOMUXC_SAI3_RXD__UART2_DCE_RTS 0x1C0 0x420 0x5EC 0x4 0x3
#define MX8MP_IOMUXC_SAI3_RXD__UART2_DTE_CTS 0x1C0 0x420 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI3_RXD__GPIO4_IO30 0x1C0 0x420 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI3_RXD__AUDIOMIX_BIT_STREAM01 0x1C0 0x420 0x4C4 0x6 0x6
+#define MX8MP_IOMUXC_SAI3_RXD__AUDIOMIX_BIT_STREAM01 0x1C0 0x420 0x4C4 0x6 0x7
#define MX8MP_IOMUXC_SAI3_RXD__TPSMP_HDATA00 0x1C0 0x420 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI3_TXFS__AUDIOMIX_SAI3_TX_SYNC 0x1C4 0x424 0x4EC 0x0 0x1
#define MX8MP_IOMUXC_SAI3_TXFS__AUDIOMIX_SAI2_TX_DATA01 0x1C4 0x424 0x000 0x1 0x0
@@ -706,7 +706,7 @@
#define MX8MP_IOMUXC_SAI3_TXFS__UART2_DCE_RX 0x1C4 0x424 0x5F0 0x4 0x4
#define MX8MP_IOMUXC_SAI3_TXFS__UART2_DTE_TX 0x1C4 0x424 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI3_TXFS__GPIO4_IO31 0x1C4 0x424 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI3_TXFS__AUDIOMIX_BIT_STREAM03 0x1C4 0x424 0x4CC 0x6 0x5
+#define MX8MP_IOMUXC_SAI3_TXFS__AUDIOMIX_BIT_STREAM03 0x1C4 0x424 0x4CC 0x6 0x6
#define MX8MP_IOMUXC_SAI3_TXFS__TPSMP_HDATA01 0x1C4 0x424 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI3_TXC__AUDIOMIX_SAI3_TX_BCLK 0x1C8 0x428 0x4E8 0x0 0x1
#define MX8MP_IOMUXC_SAI3_TXC__AUDIOMIX_SAI2_TX_DATA02 0x1C8 0x428 0x000 0x1 0x0
@@ -715,7 +715,7 @@
#define MX8MP_IOMUXC_SAI3_TXC__UART2_DCE_TX 0x1C8 0x428 0x000 0x4 0x0
#define MX8MP_IOMUXC_SAI3_TXC__UART2_DTE_RX 0x1C8 0x428 0x5F0 0x4 0x5
#define MX8MP_IOMUXC_SAI3_TXC__GPIO5_IO00 0x1C8 0x428 0x000 0x5 0x0
-#define MX8MP_IOMUXC_SAI3_TXC__AUDIOMIX_BIT_STREAM02 0x1C8 0x428 0x4C8 0x6 0x6
+#define MX8MP_IOMUXC_SAI3_TXC__AUDIOMIX_BIT_STREAM02 0x1C8 0x428 0x4C8 0x6 0x7
#define MX8MP_IOMUXC_SAI3_TXC__TPSMP_HDATA02 0x1C8 0x428 0x000 0x7 0x0
#define MX8MP_IOMUXC_SAI3_TXD__AUDIOMIX_SAI3_TX_DATA00 0x1CC 0x42C 0x000 0x0 0x0
#define MX8MP_IOMUXC_SAI3_TXD__AUDIOMIX_SAI2_TX_DATA03 0x1CC 0x42C 0x000 0x1 0x0
diff --git a/arch/arm64/boot/dts/freescale/imx8mp.dtsi b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
index 9b1616e59d58..9f6ba763238d 100644
--- a/arch/arm64/boot/dts/freescale/imx8mp.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mp.dtsi
@@ -145,7 +145,7 @@
aips1: bus@30000000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x301f0000 0x10000>;
+ reg = <0x30000000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
@@ -318,7 +318,7 @@
aips2: bus@30400000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x305f0000 0x400000>;
+ reg = <0x30400000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
@@ -378,7 +378,7 @@
aips3: bus@30800000 {
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x309f0000 0x400000>;
+ reg = <0x30800000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
diff --git a/arch/arm64/boot/dts/freescale/imx8mq.dtsi b/arch/arm64/boot/dts/freescale/imx8mq.dtsi
index 75b384217a23..bab88369be1b 100644
--- a/arch/arm64/boot/dts/freescale/imx8mq.dtsi
+++ b/arch/arm64/boot/dts/freescale/imx8mq.dtsi
@@ -291,7 +291,7 @@
bus@30000000 { /* AIPS1 */
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x301f0000 0x10000>;
+ reg = <0x30000000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges = <0x30000000 0x30000000 0x400000>;
@@ -696,7 +696,7 @@
bus@30400000 { /* AIPS2 */
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x305f0000 0x10000>;
+ reg = <0x30400000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges = <0x30400000 0x30400000 0x400000>;
@@ -756,7 +756,7 @@
bus@30800000 { /* AIPS3 */
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x309f0000 0x10000>;
+ reg = <0x30800000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges = <0x30800000 0x30800000 0x400000>,
@@ -1029,7 +1029,7 @@
bus@32c00000 { /* AIPS4 */
compatible = "fsl,aips-bus", "simple-bus";
- reg = <0x32df0000 0x10000>;
+ reg = <0x32c00000 0x400000>;
#address-cells = <1>;
#size-cells = <1>;
ranges = <0x32c00000 0x32c00000 0x400000>;
diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
index ccb8e88a60c5..d819e44d94a8 100644
--- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi
@@ -1402,8 +1402,8 @@
"venc_lt_sel";
assigned-clocks = <&topckgen CLK_TOP_VENC_SEL>,
<&topckgen CLK_TOP_VENC_LT_SEL>;
- assigned-clock-parents = <&topckgen CLK_TOP_VENCPLL_D2>,
- <&topckgen CLK_TOP_UNIVPLL1_D2>;
+ assigned-clock-parents = <&topckgen CLK_TOP_VCODECPLL>,
+ <&topckgen CLK_TOP_VCODECPLL_370P5>;
};
jpegdec: jpegdec@18004000 {
diff --git a/arch/arm64/boot/dts/mediatek/mt8516.dtsi b/arch/arm64/boot/dts/mediatek/mt8516.dtsi
index 2f8adf042195..89af661e7f63 100644
--- a/arch/arm64/boot/dts/mediatek/mt8516.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8516.dtsi
@@ -191,6 +191,11 @@
#clock-cells = <1>;
};
+ pericfg: pericfg@10003050 {
+ compatible = "mediatek,mt8516-pericfg", "syscon";
+ reg = <0 0x10003050 0 0x1000>;
+ };
+
apmixedsys: apmixedsys@10018000 {
compatible = "mediatek,mt8516-apmixedsys", "syscon";
reg = <0 0x10018000 0 0x710>;
@@ -401,6 +406,18 @@
status = "disabled";
};
+ ethernet: ethernet@11180000 {
+ compatible = "mediatek,mt8516-eth";
+ reg = <0 0x11180000 0 0x1000>;
+ mediatek,pericfg = <&pericfg>;
+ interrupts = <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>;
+ clocks = <&topckgen CLK_TOP_RG_ETH>,
+ <&topckgen CLK_TOP_66M_ETH>,
+ <&topckgen CLK_TOP_133M_ETH>;
+ clock-names = "core", "reg", "trans";
+ status = "disabled";
+ };
+
rng: rng@1020c000 {
compatible = "mediatek,mt8516-rng",
"mediatek,mt7623-rng";
diff --git a/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi b/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi
index a31093d7142b..dfceffe6950a 100644
--- a/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi
+++ b/arch/arm64/boot/dts/mediatek/pumpkin-common.dtsi
@@ -9,6 +9,7 @@
/ {
aliases {
serial0 = &uart0;
+ ethernet0 = &ethernet;
};
chosen {
@@ -166,6 +167,24 @@
status = "okay";
};
+&ethernet {
+ pinctrl-names = "default";
+ pinctrl-0 = <&ethernet_pins_default>;
+ phy-handle = <&eth_phy>;
+ phy-mode = "rmii";
+ mac-address = [00 00 00 00 00 00];
+ status = "okay";
+
+ mdio {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ eth_phy: ethernet-phy@0 {
+ reg = <0>;
+ };
+ };
+};
+
&usb0 {
status = "okay";
dr_mode = "peripheral";
@@ -218,4 +237,19 @@
bias-pull-up;
};
};
+
+ ethernet_pins_default: ethernet {
+ pins_ethernet {
+ pinmux = <MT8516_PIN_0_EINT0__FUNC_EXT_TXD0>,
+ <MT8516_PIN_1_EINT1__FUNC_EXT_TXD1>,
+ <MT8516_PIN_5_EINT5__FUNC_EXT_RXER>,
+ <MT8516_PIN_6_EINT6__FUNC_EXT_RXC>,
+ <MT8516_PIN_7_EINT7__FUNC_EXT_RXDV>,
+ <MT8516_PIN_8_EINT8__FUNC_EXT_RXD0>,
+ <MT8516_PIN_9_EINT9__FUNC_EXT_RXD1>,
+ <MT8516_PIN_12_EINT12__FUNC_EXT_TXEN>,
+ <MT8516_PIN_38_MRG_DI__FUNC_EXT_MDIO>,
+ <MT8516_PIN_39_MRG_DO__FUNC_EXT_MDC>;
+ };
+ };
};
diff --git a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
index af87350b5547..c4abbccf2bed 100644
--- a/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
+++ b/arch/arm64/boot/dts/qcom/apq8096-db820c.dtsi
@@ -658,8 +658,8 @@
s11 {
qcom,saw-leader;
regulator-always-on;
- regulator-min-microvolt = <1230000>;
- regulator-max-microvolt = <1230000>;
+ regulator-min-microvolt = <980000>;
+ regulator-max-microvolt = <980000>;
};
};
@@ -908,10 +908,27 @@
status = "okay";
};
+&q6asmdai {
+ dai@0 {
+ reg = <0>;
+ };
+
+ dai@1 {
+ reg = <1>;
+ };
+
+ dai@2 {
+ reg = <2>;
+ };
+};
+
&sound {
compatible = "qcom,apq8096-sndcard";
model = "DB820c";
- audio-routing = "RX_BIAS", "MCLK";
+ audio-routing = "RX_BIAS", "MCLK",
+ "MM_DL1", "MultiMedia1 Playback",
+ "MM_DL2", "MultiMedia2 Playback",
+ "MultiMedia3 Capture", "MM_UL3";
mm1-dai-link {
link-name = "MultiMedia1";
diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 14827adebd94..98634d5c4440 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -2066,6 +2066,8 @@
reg = <APR_SVC_ASM>;
q6asmdai: dais {
compatible = "qcom,q6asm-dais";
+ #address-cells = <1>;
+ #size-cells = <0>;
#sound-dai-cells = <1>;
iommus = <&lpass_q6_smmu 1>;
};
diff --git a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts
index a2e05926b429..21fd6f8d5799 100644
--- a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts
+++ b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts
@@ -442,17 +442,14 @@
&q6asmdai {
dai@0 {
reg = <0>;
- direction = <2>;
};
dai@1 {
reg = <1>;
- direction = <2>;
};
dai@2 {
reg = <2>;
- direction = <1>;
};
dai@3 {
diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi b/arch/arm64/boot/dts/qcom/sdm845.dtsi
index 8f926b5234d4..de6bb86c4968 100644
--- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
+++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
@@ -1761,6 +1761,8 @@
ipa: ipa@1e40000 {
compatible = "qcom,sdm845-ipa";
+
+ iommus = <&apps_smmu 0x720 0x3>;
reg = <0 0x1e40000 0 0x7000>,
<0 0x1e47000 0 0x2000>,
<0 0x1e04000 0 0x2c000>;
diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts
index 3b617a75fafa..51a670ad15b2 100644
--- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts
+++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts
@@ -359,12 +359,10 @@
&q6asmdai {
dai@0 {
reg = <0>;
- direction = <2>;
};
dai@1 {
reg = <1>;
- direction = <1>;
};
};
diff --git a/arch/arm64/boot/dts/renesas/r8a77970-eagle.dts b/arch/arm64/boot/dts/renesas/r8a77970-eagle.dts
index 2afb91ec9c8d..ac2156ab3e62 100644
--- a/arch/arm64/boot/dts/renesas/r8a77970-eagle.dts
+++ b/arch/arm64/boot/dts/renesas/r8a77970-eagle.dts
@@ -137,8 +137,6 @@
adi,input-depth = <8>;
adi,input-colorspace = "rgb";
adi,input-clock = "1x";
- adi,input-style = <1>;
- adi,input-justification = "evenly";
ports {
#address-cells = <1>;
diff --git a/arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts b/arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts
index d7c7b9156e08..01c4ba0f7be1 100644
--- a/arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts
+++ b/arch/arm64/boot/dts/renesas/r8a77970-v3msk.dts
@@ -150,8 +150,6 @@
adi,input-depth = <8>;
adi,input-colorspace = "rgb";
adi,input-clock = "1x";
- adi,input-style = <1>;
- adi,input-justification = "evenly";
ports {
#address-cells = <1>;
diff --git a/arch/arm64/boot/dts/renesas/r8a77980-condor.dts b/arch/arm64/boot/dts/renesas/r8a77980-condor.dts
index 3dde028e22a6..ef8350a062af 100644
--- a/arch/arm64/boot/dts/renesas/r8a77980-condor.dts
+++ b/arch/arm64/boot/dts/renesas/r8a77980-condor.dts
@@ -174,8 +174,6 @@
adi,input-depth = <8>;
adi,input-colorspace = "rgb";
adi,input-clock = "1x";
- adi,input-style = <1>;
- adi,input-justification = "evenly";
ports {
#address-cells = <1>;
diff --git a/arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts b/arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts
index adbfd8f07d06..6dff04693223 100644
--- a/arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts
+++ b/arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dts
@@ -141,8 +141,6 @@
adi,input-depth = <8>;
adi,input-colorspace = "rgb";
adi,input-clock = "1x";
- adi,input-style = <1>;
- adi,input-justification = "evenly";
ports {
#address-cells = <1>;
diff --git a/arch/arm64/boot/dts/renesas/r8a77980.dtsi b/arch/arm64/boot/dts/renesas/r8a77980.dtsi
index e01b0508a18f..d672b320bc14 100644
--- a/arch/arm64/boot/dts/renesas/r8a77980.dtsi
+++ b/arch/arm64/boot/dts/renesas/r8a77980.dtsi
@@ -1318,6 +1318,7 @@
ipmmu_vip0: mmu@e7b00000 {
compatible = "renesas,ipmmu-r8a77980";
reg = <0 0xe7b00000 0 0x1000>;
+ renesas,ipmmu-main = <&ipmmu_mm 4>;
power-domains = <&sysc R8A77980_PD_ALWAYS_ON>;
#iommu-cells = <1>;
};
@@ -1325,6 +1326,7 @@
ipmmu_vip1: mmu@e7960000 {
compatible = "renesas,ipmmu-r8a77980";
reg = <0 0xe7960000 0 0x1000>;
+ renesas,ipmmu-main = <&ipmmu_mm 11>;
power-domains = <&sysc R8A77980_PD_ALWAYS_ON>;
#iommu-cells = <1>;
};
diff --git a/arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts b/arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts
index 4fd2b14fbb8b..dc24cec46ae1 100644
--- a/arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts
+++ b/arch/arm64/boot/dts/renesas/r8a77990-ebisu.dts
@@ -360,8 +360,6 @@
adi,input-depth = <8>;
adi,input-colorspace = "rgb";
adi,input-clock = "1x";
- adi,input-style = <1>;
- adi,input-justification = "evenly";
ports {
#address-cells = <1>;
diff --git a/arch/arm64/boot/dts/renesas/r8a77995-draak.dts b/arch/arm64/boot/dts/renesas/r8a77995-draak.dts
index 67634cb01d6b..79c73a99d2fe 100644
--- a/arch/arm64/boot/dts/renesas/r8a77995-draak.dts
+++ b/arch/arm64/boot/dts/renesas/r8a77995-draak.dts
@@ -272,8 +272,8 @@
hdmi-encoder@39 {
compatible = "adi,adv7511w";
- reg = <0x39>, <0x3f>, <0x38>, <0x3c>;
- reg-names = "main", "edid", "packet", "cec";
+ reg = <0x39>, <0x3f>, <0x3c>, <0x38>;
+ reg-names = "main", "edid", "cec", "packet";
interrupt-parent = <&gpio1>;
interrupts = <28 IRQ_TYPE_LEVEL_LOW>;
@@ -284,8 +284,6 @@
adi,input-depth = <8>;
adi,input-colorspace = "rgb";
adi,input-clock = "1x";
- adi,input-style = <1>;
- adi,input-justification = "evenly";
ports {
#address-cells = <1>;
diff --git a/arch/arm64/boot/dts/rockchip/px30.dtsi b/arch/arm64/boot/dts/rockchip/px30.dtsi
index f809dd6d5dc3..adc9b8bf5eaa 100644
--- a/arch/arm64/boot/dts/rockchip/px30.dtsi
+++ b/arch/arm64/boot/dts/rockchip/px30.dtsi
@@ -143,7 +143,7 @@
};
arm-pmu {
- compatible = "arm,cortex-a53-pmu";
+ compatible = "arm,cortex-a35-pmu";
interrupts = <GIC_SPI 100 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 101 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 102 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm64/boot/dts/rockchip/rk3308.dtsi b/arch/arm64/boot/dts/rockchip/rk3308.dtsi
index ac43bc3f7031..ac7f694079d0 100644
--- a/arch/arm64/boot/dts/rockchip/rk3308.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3308.dtsi
@@ -127,7 +127,7 @@
};
arm-pmu {
- compatible = "arm,cortex-a53-pmu";
+ compatible = "arm,cortex-a35-pmu";
interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 85 IRQ_TYPE_LEVEL_HIGH>,
diff --git a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts
index 49c4b96da3d4..ac29c2744d08 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts
@@ -82,17 +82,16 @@
&gmac2phy {
phy-supply = <&vcc_phy>;
clock_in_out = "output";
- assigned-clocks = <&cru SCLK_MAC2PHY_SRC>;
assigned-clock-rate = <50000000>;
assigned-clocks = <&cru SCLK_MAC2PHY>;
assigned-clock-parents = <&cru SCLK_MAC2PHY_SRC>;
-
+ status = "okay";
};
&i2c1 {
status = "okay";
- rk805: rk805@18 {
+ rk805: pmic@18 {
compatible = "rockchip,rk805";
reg = <0x18>;
interrupt-parent = <&gpio2>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
index bf3e546f5266..ebf3eb222e1f 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3328-rock64.dts
@@ -170,7 +170,7 @@
&i2c1 {
status = "okay";
- rk805: rk805@18 {
+ rk805: pmic@18 {
compatible = "rockchip,rk805";
reg = <0x18>;
interrupt-parent = <&gpio2>;
diff --git a/arch/arm64/boot/dts/rockchip/rk3328.dtsi b/arch/arm64/boot/dts/rockchip/rk3328.dtsi
index 7e88d88aab98..a4d591d91533 100644
--- a/arch/arm64/boot/dts/rockchip/rk3328.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3328.dtsi
@@ -299,8 +299,6 @@
grf: syscon@ff100000 {
compatible = "rockchip,rk3328-grf", "syscon", "simple-mfd";
reg = <0x0 0xff100000 0x0 0x1000>;
- #address-cells = <1>;
- #size-cells = <1>;
io_domains: io-domains {
compatible = "rockchip,rk3328-io-voltage-domain";
@@ -1794,10 +1792,6 @@
};
gmac2phy {
- fephyled_speed100: fephyled-speed100 {
- rockchip,pins = <0 RK_PD7 1 &pcfg_pull_none>;
- };
-
fephyled_speed10: fephyled-speed10 {
rockchip,pins = <0 RK_PD6 1 &pcfg_pull_none>;
};
@@ -1806,18 +1800,6 @@
rockchip,pins = <0 RK_PD6 2 &pcfg_pull_none>;
};
- fephyled_rxm0: fephyled-rxm0 {
- rockchip,pins = <0 RK_PD5 1 &pcfg_pull_none>;
- };
-
- fephyled_txm0: fephyled-txm0 {
- rockchip,pins = <0 RK_PD5 2 &pcfg_pull_none>;
- };
-
- fephyled_linkm0: fephyled-linkm0 {
- rockchip,pins = <0 RK_PD4 1 &pcfg_pull_none>;
- };
-
fephyled_rxm1: fephyled-rxm1 {
rockchip,pins = <2 RK_PD1 2 &pcfg_pull_none>;
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts
index 5ea281b55fe2..c49982dfd8fc 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3399-pinebook-pro.dts
@@ -147,7 +147,7 @@
"Speaker", "Speaker Amplifier OUTL",
"Speaker", "Speaker Amplifier OUTR";
- simple-audio-card,hp-det-gpio = <&gpio0 RK_PB0 GPIO_ACTIVE_LOW>;
+ simple-audio-card,hp-det-gpio = <&gpio0 RK_PB0 GPIO_ACTIVE_HIGH>;
simple-audio-card,aux-devs = <&speaker_amp>;
simple-audio-card,pin-switches = "Speaker";
@@ -690,7 +690,8 @@
fusb0: fusb30x@22 {
compatible = "fcs,fusb302";
reg = <0x22>;
- fcs,int_n = <&gpio1 RK_PA2 GPIO_ACTIVE_HIGH>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <RK_PA2 IRQ_TYPE_LEVEL_LOW>;
pinctrl-names = "default";
pinctrl-0 = <&fusb0_int_gpio>;
vbus-supply = <&vbus_typec>;
@@ -788,13 +789,13 @@
dc-charger {
dc_det_gpio: dc-det-gpio {
- rockchip,pins = <4 RK_PD0 RK_FUNC_GPIO &pcfg_pull_none>;
+ rockchip,pins = <4 RK_PD0 RK_FUNC_GPIO &pcfg_pull_up>;
};
};
es8316 {
hp_det_gpio: hp-det-gpio {
- rockchip,pins = <0 RK_PB0 RK_FUNC_GPIO &pcfg_pull_down>;
+ rockchip,pins = <0 RK_PB0 RK_FUNC_GPIO &pcfg_pull_up>;
};
};
diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
index 74f2c3d49095..1448f358ed0a 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi
@@ -403,7 +403,7 @@
reset-names = "usb3-otg";
status = "disabled";
- usbdrd_dwc3_0: dwc3 {
+ usbdrd_dwc3_0: usb@fe800000 {
compatible = "snps,dwc3";
reg = <0x0 0xfe800000 0x0 0x100000>;
interrupts = <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH 0>;
@@ -439,7 +439,7 @@
reset-names = "usb3-otg";
status = "disabled";
- usbdrd_dwc3_1: dwc3 {
+ usbdrd_dwc3_1: usb@fe900000 {
compatible = "snps,dwc3";
reg = <0x0 0xfe900000 0x0 0x100000>;
interrupts = <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH 0>;
@@ -1124,8 +1124,6 @@
pmugrf: syscon@ff320000 {
compatible = "rockchip,rk3399-pmugrf", "syscon", "simple-mfd";
reg = <0x0 0xff320000 0x0 0x1000>;
- #address-cells = <1>;
- #size-cells = <1>;
pmu_io_domains: io-domains {
compatible = "rockchip,rk3399-pmu-io-voltage-domain";
@@ -1883,10 +1881,10 @@
gpu: gpu@ff9a0000 {
compatible = "rockchip,rk3399-mali", "arm,mali-t860";
reg = <0x0 0xff9a0000 0x0 0x10000>;
- interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH 0>,
- <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH 0>,
- <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH 0>;
- interrupt-names = "gpu", "job", "mmu";
+ interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH 0>,
+ <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH 0>,
+ <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH 0>;
+ interrupt-names = "job", "mmu", "gpu";
clocks = <&cru ACLK_GPU>;
#cooling-cells = <2>;
power-domains = <&power RK3399_PD_GPU>;
diff --git a/arch/arm64/boot/dts/ti/k3-am65-main.dtsi b/arch/arm64/boot/dts/ti/k3-am65-main.dtsi
index 11887c72f23a..0d533d52fcda 100644
--- a/arch/arm64/boot/dts/ti/k3-am65-main.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-am65-main.dtsi
@@ -570,6 +570,28 @@
<0x5>; /* RX_CHAN */
ti,sci-rm-range-rflow = <0x6>; /* GP RFLOW */
};
+
+ cpts@310d0000 {
+ compatible = "ti,am65-cpts";
+ reg = <0x0 0x310d0000 0x0 0x400>;
+ reg-names = "cpts";
+ clocks = <&main_cpts_mux>;
+ clock-names = "cpts";
+ interrupts-extended = <&intr_main_navss 163 0>;
+ interrupt-names = "cpts";
+ ti,cpts-periodic-outputs = <6>;
+ ti,cpts-ext-ts-inputs = <8>;
+
+ main_cpts_mux: refclk-mux {
+ #clock-cells = <0>;
+ clocks = <&k3_clks 118 5>, <&k3_clks 118 11>,
+ <&k3_clks 118 6>, <&k3_clks 118 3>,
+ <&k3_clks 118 8>, <&k3_clks 118 14>,
+ <&k3_clks 120 3>, <&k3_clks 121 3>;
+ assigned-clocks = <&main_cpts_mux>;
+ assigned-clock-parents = <&k3_clks 118 5>;
+ };
+ };
};
main_gpio0: main_gpio0@600000 {
diff --git a/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi b/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi
index 353d1e2532a7..ae5f813d0cac 100644
--- a/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-am65-mcu.dtsi
@@ -247,5 +247,26 @@
clock-names = "fck";
bus_freq = <1000000>;
};
+
+ cpts@3d000 {
+ compatible = "ti,am65-cpts";
+ reg = <0x0 0x3d000 0x0 0x400>;
+ clocks = <&mcu_cpsw_cpts_mux>;
+ clock-names = "cpts";
+ interrupts-extended = <&gic500 GIC_SPI 570 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "cpts";
+ ti,cpts-ext-ts-inputs = <4>;
+ ti,cpts-periodic-outputs = <2>;
+
+ mcu_cpsw_cpts_mux: refclk-mux {
+ #clock-cells = <0>;
+ clocks = <&k3_clks 118 5>, <&k3_clks 118 11>,
+ <&k3_clks 118 6>, <&k3_clks 118 3>,
+ <&k3_clks 118 8>, <&k3_clks 118 14>,
+ <&k3_clks 120 3>, <&k3_clks 121 3>;
+ assigned-clocks = <&mcu_cpsw_cpts_mux>;
+ assigned-clock-parents = <&k3_clks 118 5>;
+ };
+ };
};
};
diff --git a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi
index 0b9d14b838a1..844a5b50cf09 100644
--- a/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-j721e-main.dtsi
@@ -254,6 +254,18 @@
<0x0c>; /* RX_UHCHAN */
ti,sci-rm-range-rflow = <0x00>; /* GP RFLOW */
};
+
+ cpts@310d0000 {
+ compatible = "ti,j721e-cpts";
+ reg = <0x0 0x310d0000 0x0 0x400>;
+ reg-names = "cpts";
+ clocks = <&k3_clks 201 1>;
+ clock-names = "cpts";
+ interrupts-extended = <&main_navss_intr 201 0>;
+ interrupt-names = "cpts";
+ ti,cpts-periodic-outputs = <6>;
+ ti,cpts-ext-ts-inputs = <8>;
+ };
};
main_pmx0: pinmux@11c000 {
diff --git a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi
index 3d6064125b40..dc31bd0434cb 100644
--- a/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-j721e-mcu-wakeup.dtsi
@@ -338,5 +338,16 @@
clock-names = "fck";
bus_freq = <1000000>;
};
+
+ cpts@3d000 {
+ compatible = "ti,am65-cpts";
+ reg = <0x0 0x3d000 0x0 0x400>;
+ clocks = <&k3_clks 18 2>;
+ clock-names = "cpts";
+ interrupts-extended = <&gic500 GIC_SPI 858 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "cpts";
+ ti,cpts-ext-ts-inputs = <4>;
+ ti,cpts-periodic-outputs = <2>;
+ };
};
};
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 24e534d85045..883e8bace3ed 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -84,6 +84,7 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y
CONFIG_CPUFREQ_DT=y
CONFIG_ACPI_CPPC_CPUFREQ=m
+CONFIG_ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM=m
CONFIG_ARM_ARMADA_37XX_CPUFREQ=y
CONFIG_ARM_SCPI_CPUFREQ=y
CONFIG_ARM_IMX_CPUFREQ_DT=m
@@ -188,6 +189,7 @@ CONFIG_NET_9P_VIRTIO=y
CONFIG_PCI=y
CONFIG_PCIEPORTBUS=y
CONFIG_PCI_IOV=y
+CONFIG_PCI_PASID=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_ACPI=y
CONFIG_PCI_AARDVARK=y
@@ -208,7 +210,7 @@ CONFIG_PCIE_QCOM=y
CONFIG_PCIE_ARMADA_8K=y
CONFIG_PCIE_KIRIN=y
CONFIG_PCIE_HISI_STB=y
-CONFIG_PCIE_TEGRA194=m
+CONFIG_PCIE_TEGRA194_HOST=m
CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_FW_LOADER_USER_HELPER=y
@@ -241,6 +243,7 @@ CONFIG_BLK_DEV_NVME=m
CONFIG_SRAM=y
CONFIG_EEPROM_AT24=m
CONFIG_EEPROM_AT25=m
+CONFIG_UACCE=m
# CONFIG_SCSI_PROC_FS is not set
CONFIG_BLK_DEV_SD=y
CONFIG_SCSI_SAS_ATA=y
@@ -305,6 +308,7 @@ CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
CONFIG_QCOM_EMAC=m
+CONFIG_RMNET=m
CONFIG_RAVB=y
CONFIG_SMC91X=y
CONFIG_SMSC911X=y
@@ -312,6 +316,7 @@ CONFIG_SNI_AVE=y
CONFIG_SNI_NETSEC=y
CONFIG_STMMAC_ETH=m
CONFIG_TI_K3_AM65_CPSW_NUSS=y
+CONFIG_QCOM_IPA=m
CONFIG_MDIO_BUS_MUX_MMIOREG=y
CONFIG_MDIO_BUS_MUX_MULTIPLEXER=y
CONFIG_AQUANTIA_PHY=y
@@ -410,6 +415,7 @@ CONFIG_I2C_MESON=y
CONFIG_I2C_MV64XXX=y
CONFIG_I2C_OWL=y
CONFIG_I2C_PXA=y
+CONFIG_I2C_QCOM_CCI=m
CONFIG_I2C_QCOM_GENI=m
CONFIG_I2C_QUP=y
CONFIG_I2C_RK3X=y
@@ -450,6 +456,7 @@ CONFIG_PINCTRL_IMX8MN=y
CONFIG_PINCTRL_IMX8MP=y
CONFIG_PINCTRL_IMX8MQ=y
CONFIG_PINCTRL_IMX8QXP=y
+CONFIG_PINCTRL_IMX8DXL=y
CONFIG_PINCTRL_IPQ8074=y
CONFIG_PINCTRL_IPQ6018=y
CONFIG_PINCTRL_MSM8916=y
@@ -459,6 +466,7 @@ CONFIG_PINCTRL_MSM8998=y
CONFIG_PINCTRL_QCS404=y
CONFIG_PINCTRL_QDF2XXX=y
CONFIG_PINCTRL_QCOM_SPMI_PMIC=y
+CONFIG_PINCTRL_SC7180=y
CONFIG_PINCTRL_SDM845=y
CONFIG_PINCTRL_SM8150=y
CONFIG_GPIO_ALTERA=m
@@ -513,6 +521,7 @@ CONFIG_UNIPHIER_THERMAL=y
CONFIG_WATCHDOG=y
CONFIG_ARM_SP805_WATCHDOG=y
CONFIG_ARM_SBSA_WATCHDOG=y
+CONFIG_ARM_SMC_WATCHDOG=y
CONFIG_S3C2410_WATCHDOG=y
CONFIG_DW_WATCHDOG=y
CONFIG_SUNXI_WATCHDOG=m
@@ -567,6 +576,7 @@ CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y
CONFIG_MEDIA_SDR_SUPPORT=y
CONFIG_MEDIA_CONTROLLER=y
CONFIG_VIDEO_V4L2_SUBDEV_API=y
+CONFIG_MEDIA_PLATFORM_SUPPORT=y
# CONFIG_DVB_NET is not set
CONFIG_MEDIA_USB_SUPPORT=y
CONFIG_USB_VIDEO_CLASS=m
@@ -583,6 +593,7 @@ CONFIG_VIDEO_RENESAS_FCP=m
CONFIG_VIDEO_RENESAS_VSP1=m
CONFIG_SDR_PLATFORM_DRIVERS=y
CONFIG_VIDEO_RCAR_DRIF=m
+CONFIG_VIDEO_QCOM_CAMSS=m
CONFIG_DRM=m
CONFIG_DRM_I2C_NXP_TDA998X=m
CONFIG_DRM_MALI_DISPLAY=m
@@ -610,8 +621,9 @@ CONFIG_DRM_MSM=m
CONFIG_DRM_TEGRA=m
CONFIG_DRM_PANEL_LVDS=m
CONFIG_DRM_PANEL_SIMPLE=m
-CONFIG_DRM_DUMB_VGA_DAC=m
+CONFIG_DRM_SIMPLE_BRIDGE=m
CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m
+CONFIG_DRM_DISPLAY_CONNECTOR=m
CONFIG_DRM_SII902X=m
CONFIG_DRM_THINE_THC63LVD1024=m
CONFIG_DRM_TI_SN65DSI86=m
@@ -642,6 +654,7 @@ CONFIG_SND_HDA_CODEC_HDMI=m
CONFIG_SND_SOC=y
CONFIG_SND_BCM2835_SOC_I2S=m
CONFIG_SND_MESON_AXG_SOUND_CARD=m
+CONFIG_SND_MESON_GX_SOUND_CARD=m
CONFIG_SND_SOC_SDM845=m
CONFIG_SND_SOC_ROCKCHIP=m
CONFIG_SND_SOC_ROCKCHIP_SPDIF=m
@@ -654,6 +667,7 @@ CONFIG_SND_SOC_AK4613=m
CONFIG_SND_SOC_ES7134=m
CONFIG_SND_SOC_ES7241=m
CONFIG_SND_SOC_PCM3168A_I2C=m
+CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m
CONFIG_SND_SOC_TAS571X=m
CONFIG_SND_SOC_WCD934X=m
CONFIG_SND_SOC_WSA881X=m
@@ -723,6 +737,7 @@ CONFIG_LEDS_CLASS=y
CONFIG_LEDS_GPIO=y
CONFIG_LEDS_PWM=y
CONFIG_LEDS_SYSCON=y
+CONFIG_LEDS_TRIGGER_TIMER=y
CONFIG_LEDS_TRIGGER_DISK=y
CONFIG_LEDS_TRIGGER_HEARTBEAT=y
CONFIG_LEDS_TRIGGER_CPU=y
@@ -802,10 +817,13 @@ CONFIG_MSM_GCC_8994=y
CONFIG_MSM_MMCC_8996=y
CONFIG_MSM_GCC_8998=y
CONFIG_QCS_GCC_404=y
+CONFIG_SC_GCC_7180=y
+CONFIG_SDM_CAMCC_845=m
CONFIG_SDM_GCC_845=y
CONFIG_SDM_GPUCC_845=y
CONFIG_SDM_DISPCC_845=y
CONFIG_SM_GCC_8150=y
+CONFIG_SM_GCC_8250=y
CONFIG_QCOM_HFPLL=y
CONFIG_HWSPINLOCK=y
CONFIG_HWSPINLOCK_QCOM=y
@@ -835,7 +853,6 @@ CONFIG_FSL_MC_DPIO=y
CONFIG_IMX_SCU_SOC=y
CONFIG_QCOM_AOSS_QMP=y
CONFIG_QCOM_GENI_SE=y
-CONFIG_QCOM_GLINK_SSR=m
CONFIG_QCOM_RMTFS_MEM=m
CONFIG_QCOM_RPMH=y
CONFIG_QCOM_RPMHPD=y
@@ -848,7 +865,8 @@ CONFIG_QCOM_APR=m
CONFIG_ARCH_R8A774A1=y
CONFIG_ARCH_R8A774B1=y
CONFIG_ARCH_R8A774C0=y
-CONFIG_ARCH_R8A7795=y
+CONFIG_ARCH_R8A77950=y
+CONFIG_ARCH_R8A77951=y
CONFIG_ARCH_R8A77960=y
CONFIG_ARCH_R8A77961=y
CONFIG_ARCH_R8A77965=y
@@ -968,7 +986,9 @@ CONFIG_CRYPTO_DEV_FSL_CAAM=m
CONFIG_CRYPTO_DEV_FSL_DPAA2_CAAM=m
CONFIG_CRYPTO_DEV_QCOM_RNG=m
CONFIG_CRYPTO_DEV_CCREE=m
+CONFIG_CRYPTO_DEV_HISI_SEC2=m
CONFIG_CRYPTO_DEV_HISI_ZIP=m
+CONFIG_CRYPTO_DEV_HISI_HPRE=m
CONFIG_CMA_SIZE_MBYTES=32
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index ed5409c6abf4..395bbf64b2ab 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -158,7 +158,6 @@ static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm,
unsigned int key_len)
{
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
- SHASH_DESC_ON_STACK(desc, ctx->hash);
u8 digest[SHA256_DIGEST_SIZE];
int ret;
@@ -166,8 +165,7 @@ static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm,
if (ret)
return ret;
- desc->tfm = ctx->hash;
- crypto_shash_digest(desc, in_key, key_len, digest);
+ crypto_shash_tfm_digest(ctx->hash, in_key, key_len, digest);
return aes_expandkey(&ctx->key2, digest, sizeof(digest));
}
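
This hunk drops the on-stack SHASH_DESC_ON_STACK descriptor and its manual tfm assignment in favour of crypto_shash_tfm_digest(), which sets up the descriptor internally and hashes the key in a single call. A hedged sketch of that kind of convenience wrapper over a hypothetical init/update/final hash API (the names below are illustrative, not the kernel's):

#include <stddef.h>

/* Hypothetical one-shot helper built from init/update/final, mirroring
 * the role crypto_shash_tfm_digest() plays over a stack descriptor. */
struct hash_ctx;                                /* opaque per-call state */
int hash_init(struct hash_ctx *ctx);
int hash_update(struct hash_ctx *ctx, const void *data, size_t len);
int hash_final(struct hash_ctx *ctx, unsigned char *out);

static int hash_oneshot(struct hash_ctx *ctx, const void *data, size_t len,
                        unsigned char *out)
{
        int ret = hash_init(ctx);

        if (!ret)
                ret = hash_update(ctx, data, len);
        if (!ret)
                ret = hash_final(ctx, out);
        return ret;
}
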
diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c
index 37ca3e889848..af2bbca38e70 100644
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -87,9 +87,17 @@ void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
!crypto_simd_usable())
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
- kernel_neon_begin();
- chacha_doneon(state, dst, src, bytes, nrounds);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, todo, nrounds);
+ kernel_neon_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
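
The loop introduced above bounds how much data is processed inside a single kernel_neon_begin()/kernel_neon_end() window: at most SZ_4K per pass, so preemption is not held off across an arbitrarily large request. A minimal standalone sketch of the chunking pattern; begin_fast_section(), end_fast_section() and process_block() are hypothetical stand-ins for the kernel primitives:

#include <stddef.h>

#define CHUNK_MAX 4096u                 /* mirrors SZ_4K in the patch */

void begin_fast_section(void);          /* hypothetical: kernel_neon_begin() */
void end_fast_section(void);            /* hypothetical: kernel_neon_end() */
void process_block(unsigned char *dst, const unsigned char *src, size_t len);

/* Walk the buffer in bounded chunks so the "fast section" is never
 * held open across more than CHUNK_MAX bytes of work. */
static void process_chunked(unsigned char *dst, const unsigned char *src,
                            size_t bytes)
{
        while (bytes) {
                size_t todo = bytes < CHUNK_MAX ? bytes : CHUNK_MAX;

                begin_fast_section();
                process_block(dst, src, todo);
                end_fast_section();

                bytes -= todo;
                src += todo;
                dst += todo;
        }
}
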
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 5a95c2628fbf..111d9c9abddd 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -66,7 +66,7 @@
#include <asm/assembler.h>
.text
- .cpu generic+crypto
+ .arch armv8-a+crypto
init_crc .req w19
buf .req x20
diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c
index 895d3727c1fb..c5405e6a6db7 100644
--- a/arch/arm64/crypto/nhpoly1305-neon-glue.c
+++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c
@@ -30,7 +30,7 @@ static int nhpoly1305_neon_update(struct shash_desc *desc,
return crypto_nhpoly1305_update(desc, src, srclen);
do {
- unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+ unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_neon_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c
index e97b092f56b8..f33ada70c4ed 100644
--- a/arch/arm64/crypto/poly1305-glue.c
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -143,13 +143,20 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
- kernel_neon_begin();
- poly1305_blocks_neon(&dctx->h, src, len, 1);
- kernel_neon_end();
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, todo, 1);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
} else {
poly1305_blocks(&dctx->h, src, len, 1);
+ src += len;
}
- src += len;
nbytes %= POLY1305_BLOCK_SIZE;
}
diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c
index ddf4a0d85c1c..77bc6e72abae 100644
--- a/arch/arm64/crypto/sha256-glue.c
+++ b/arch/arm64/crypto/sha256-glue.c
@@ -12,7 +12,6 @@
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha256_base.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
index 78d3083de6b7..370ccb29602f 100644
--- a/arch/arm64/crypto/sha512-glue.c
+++ b/arch/arm64/crypto/sha512-glue.c
@@ -6,7 +6,6 @@
*/
#include <crypto/internal/hash.h>
-#include <linux/cryptohash.h>
#include <linux/types.h>
#include <linux/string.h>
#include <crypto/sha.h>
diff --git a/arch/arm64/include/asm/asm_pointer_auth.h b/arch/arm64/include/asm/asm_pointer_auth.h
index ce2a8486992b..52dead2a8640 100644
--- a/arch/arm64/include/asm/asm_pointer_auth.h
+++ b/arch/arm64/include/asm/asm_pointer_auth.h
@@ -39,25 +39,58 @@ alternative_if ARM64_HAS_GENERIC_AUTH
alternative_else_nop_endif
.endm
- .macro ptrauth_keys_install_kernel tsk, sync, tmp1, tmp2, tmp3
-alternative_if ARM64_HAS_ADDRESS_AUTH
+ .macro __ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3
mov \tmp1, #THREAD_KEYS_KERNEL
add \tmp1, \tsk, \tmp1
ldp \tmp2, \tmp3, [\tmp1, #PTRAUTH_KERNEL_KEY_APIA]
msr_s SYS_APIAKEYLO_EL1, \tmp2
msr_s SYS_APIAKEYHI_EL1, \tmp3
- .if \sync == 1
+ .endm
+
+ .macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3
+alternative_if ARM64_HAS_ADDRESS_AUTH
+ __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3
+alternative_else_nop_endif
+ .endm
+
+ .macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3
+alternative_if ARM64_HAS_ADDRESS_AUTH
+ __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3
isb
- .endif
alternative_else_nop_endif
.endm
+ .macro __ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3
+ mrs \tmp1, id_aa64isar1_el1
+ ubfx \tmp1, \tmp1, #ID_AA64ISAR1_APA_SHIFT, #8
+ cbz \tmp1, .Lno_addr_auth\@
+ mov_q \tmp1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
+ SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
+ mrs \tmp2, sctlr_el1
+ orr \tmp2, \tmp2, \tmp1
+ msr sctlr_el1, \tmp2
+ __ptrauth_keys_install_kernel_nosync \tsk, \tmp1, \tmp2, \tmp3
+ isb
+.Lno_addr_auth\@:
+ .endm
+
+ .macro ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3
+alternative_if_not ARM64_HAS_ADDRESS_AUTH
+ b .Lno_addr_auth\@
+alternative_else_nop_endif
+ __ptrauth_keys_init_cpu \tsk, \tmp1, \tmp2, \tmp3
+.Lno_addr_auth\@:
+ .endm
+
#else /* CONFIG_ARM64_PTR_AUTH */
.macro ptrauth_keys_install_user tsk, tmp1, tmp2, tmp3
.endm
- .macro ptrauth_keys_install_kernel tsk, sync, tmp1, tmp2, tmp3
+ .macro ptrauth_keys_install_kernel_nosync tsk, tmp1, tmp2, tmp3
+ .endm
+
+ .macro ptrauth_keys_install_kernel tsk, tmp1, tmp2, tmp3
.endm
#endif /* CONFIG_ARM64_PTR_AUTH */
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 0bff325117b4..54d181177656 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -736,4 +736,54 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU
.Lyield_out_\@ :
.endm
+/*
+ * This macro emits a program property note section identifying
+ * architecture features which require special handling, mainly for
+ * use in assembly files included in the VDSO.
+ */
+
+#define NT_GNU_PROPERTY_TYPE_0 5
+#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
+
+#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
+#define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
+
+#ifdef CONFIG_ARM64_BTI_KERNEL
+#define GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT \
+ ((GNU_PROPERTY_AARCH64_FEATURE_1_BTI | \
+ GNU_PROPERTY_AARCH64_FEATURE_1_PAC))
+#endif
+
+#ifdef GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT
+.macro emit_aarch64_feature_1_and, feat=GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT
+ .pushsection .note.gnu.property, "a"
+ .align 3
+ .long 2f - 1f
+ .long 6f - 3f
+ .long NT_GNU_PROPERTY_TYPE_0
+1: .string "GNU"
+2:
+ .align 3
+3: .long GNU_PROPERTY_AARCH64_FEATURE_1_AND
+ .long 5f - 4f
+4:
+ /*
+ * This is described with an array of char in the Linux API
+ * spec but the text and all other usage (including binutils,
+ * clang and GCC) treat this as a 32 bit value so no swizzling
+ * is required for big endian.
+ */
+ .long \feat
+5:
+ .align 3
+6:
+ .popsection
+.endm
+
+#else
+.macro emit_aarch64_feature_1_and, feat=0
+.endm
+
+#endif /* GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT */
+
#endif /* __ASM_ASSEMBLER_H */
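
The emit_aarch64_feature_1_and macro added above open-codes an NT_GNU_PROPERTY_TYPE_0 note: a standard ELF note header with owner "GNU" and type 5, followed by one property of type GNU_PROPERTY_AARCH64_FEATURE_1_AND whose data is a single 32-bit feature bitmask, padded so the descriptor stays a multiple of 8 bytes. As a reading aid, a hedged C mirror of the byte layout the macro produces for a 64-bit object:

#include <stdint.h>

struct gnu_property_note_sketch {
        /* Elf64_Nhdr equivalent */
        uint32_t n_namesz;      /* 4: strlen("GNU") + NUL */
        uint32_t n_descsz;      /* 16: the property block below, padded */
        uint32_t n_type;        /* NT_GNU_PROPERTY_TYPE_0 == 5 */
        char     name[4];       /* "GNU\0" */

        /* one GNU property descriptor */
        uint32_t pr_type;       /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
        uint32_t pr_datasz;     /* 4 */
        uint32_t pr_data;       /* bitmask: bit 0 = BTI, bit 1 = PAC */
        uint32_t pr_pad;        /* keeps the descriptor 8-byte aligned */
};
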
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index e6cca3d4acf7..ce50c1f1f1ea 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -79,7 +79,7 @@ static inline void flush_icache_range(unsigned long start, unsigned long end)
* IPI all online CPUs so that they undergo a context synchronization
* event and are forced to refetch the new instructions.
*/
-#ifdef CONFIG_KGDB
+
/*
* KGDB performs cache maintenance with interrupts disabled, so we
* will deadlock trying to IPI the secondary CPUs. In theory, we can
@@ -89,9 +89,9 @@ static inline void flush_icache_range(unsigned long start, unsigned long end)
* the patching operation, so we don't need extra IPIs here anyway.
* In which case, add a KGDB-specific bodge and return early.
*/
- if (kgdb_connected && irqs_disabled())
+ if (in_dbg_master())
return;
-#endif
+
kick_all_cpus_sync();
}
diff --git a/arch/arm64/include/asm/compiler.h b/arch/arm64/include/asm/compiler.h
index eece20d2c55f..51a7ce87cdfe 100644
--- a/arch/arm64/include/asm/compiler.h
+++ b/arch/arm64/include/asm/compiler.h
@@ -2,8 +2,6 @@
#ifndef __ASM_COMPILER_H
#define __ASM_COMPILER_H
-#if defined(CONFIG_ARM64_PTR_AUTH)
-
/*
* The EL0/EL1 pointer bits used by a pointer authentication code.
* This is dependent on TBI0/TBI1 being enabled, or bits 63:56 would also apply.
@@ -19,6 +17,4 @@
#define __builtin_return_address(val) \
(void *)(ptrauth_clear_pac((unsigned long)__builtin_return_address(val)))
-#endif /* CONFIG_ARM64_PTR_AUTH */
-
#endif /* __ASM_COMPILER_H */
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index b4a40535a3d8..7faae6ff3ab4 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -33,6 +33,7 @@ struct cpuinfo_arm64 {
u64 reg_id_aa64zfr0;
u32 reg_id_dfr0;
+ u32 reg_id_dfr1;
u32 reg_id_isar0;
u32 reg_id_isar1;
u32 reg_id_isar2;
@@ -44,8 +45,11 @@ struct cpuinfo_arm64 {
u32 reg_id_mmfr1;
u32 reg_id_mmfr2;
u32 reg_id_mmfr3;
+ u32 reg_id_mmfr4;
+ u32 reg_id_mmfr5;
u32 reg_id_pfr0;
u32 reg_id_pfr1;
+ u32 reg_id_pfr2;
u32 reg_mvfr0;
u32 reg_mvfr1;
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 8eb5a088ae65..d7b3bb0cb180 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -44,7 +44,7 @@
#define ARM64_SSBS 34
#define ARM64_WORKAROUND_1418040 35
#define ARM64_HAS_SB 36
-#define ARM64_WORKAROUND_SPECULATIVE_AT_VHE 37
+#define ARM64_WORKAROUND_SPECULATIVE_AT 37
#define ARM64_HAS_ADDRESS_AUTH_ARCH 38
#define ARM64_HAS_ADDRESS_AUTH_IMP_DEF 39
#define ARM64_HAS_GENERIC_AUTH_ARCH 40
@@ -55,13 +55,14 @@
#define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM 45
#define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM 46
#define ARM64_WORKAROUND_1542419 47
-#define ARM64_WORKAROUND_SPECULATIVE_AT_NVHE 48
-#define ARM64_HAS_E0PD 49
-#define ARM64_HAS_RNG 50
-#define ARM64_HAS_AMU_EXTN 51
-#define ARM64_HAS_ADDRESS_AUTH 52
-#define ARM64_HAS_GENERIC_AUTH 53
+#define ARM64_HAS_E0PD 48
+#define ARM64_HAS_RNG 49
+#define ARM64_HAS_AMU_EXTN 50
+#define ARM64_HAS_ADDRESS_AUTH 51
+#define ARM64_HAS_GENERIC_AUTH 52
+#define ARM64_HAS_32BIT_EL1 53
+#define ARM64_BTI 54
-#define ARM64_NCAPS 54
+#define ARM64_NCAPS 55
#endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index afe08251ff95..5d1f4ae42799 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -551,6 +551,13 @@ static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0)
cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_BIGENDEL0_SHIFT) == 0x1;
}
+static inline bool id_aa64pfr0_32bit_el1(u64 pfr0)
+{
+ u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT);
+
+ return val == ID_AA64PFR0_EL1_32BIT_64BIT;
+}
+
static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
{
u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT);
@@ -680,6 +687,11 @@ static inline bool system_has_prio_mask_debugging(void)
system_uses_irq_prio_masking();
}
+static inline bool system_supports_bti(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_BTI) && cpus_have_const_cap(ARM64_BTI);
+}
+
#define ARM64_BP_HARDEN_UNKNOWN -1
#define ARM64_BP_HARDEN_WA_NEEDED 0
#define ARM64_BP_HARDEN_NOT_REQUIRED 1
@@ -745,6 +757,24 @@ static inline bool cpu_has_hw_af(void)
extern bool cpu_has_amu_feat(int cpu);
#endif
+static inline unsigned int get_vmid_bits(u64 mmfr1)
+{
+ int vmid_bits;
+
+ vmid_bits = cpuid_feature_extract_unsigned_field(mmfr1,
+ ID_AA64MMFR1_VMIDBITS_SHIFT);
+ if (vmid_bits == ID_AA64MMFR1_VMIDBITS_16)
+ return 16;
+
+ /*
+ * Return the default here even if any reserved
+ * value is fetched from the system register.
+ */
+ return 8;
+}
+
+u32 get_kvm_ipa_limit(void);
+
#endif /* __ASSEMBLY__ */
#endif
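
get_vmid_bits() above is a small helper over the sanitised ID_AA64MMFR1_EL1 value: it extracts the unsigned VMIDBits field and maps the 16-bit encoding to 16, defaulting to 8 for anything else, reserved values included (the kvm_mmu.h hunk later in this diff switches kvm_get_vmid_bits() over to it). A minimal sketch of the extraction and mapping, with the field shift left as a parameter rather than hard-coding the architectural constant:

#include <stdint.h>

/* Extract an unsigned 4-bit ID register field starting at 'shift'. */
static inline unsigned int extract_u4(uint64_t reg, unsigned int shift)
{
        return (reg >> shift) & 0xf;
}

/* Field value 2 advertises 16-bit VMIDs (ID_AA64MMFR1_VMIDBITS_16);
 * everything else, reserved encodings included, falls back to 8. */
static inline unsigned int vmid_bits_sketch(uint64_t mmfr1,
                                            unsigned int vmidbits_shift)
{
        return extract_u4(mmfr1, vmidbits_shift) == 2 ? 16 : 8;
}
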
diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h
index 7619f473155f..e5ceea213e39 100644
--- a/arch/arm64/include/asm/debug-monitors.h
+++ b/arch/arm64/include/asm/debug-monitors.h
@@ -125,5 +125,7 @@ static inline int reinstall_suspended_bps(struct pt_regs *regs)
int aarch32_break_handler(struct pt_regs *regs);
+void debug_traps_init(void);
+
#endif /* __ASSEMBLY */
#endif /* __ASM_DEBUG_MONITORS_H */
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index 45e821222774..d4ab3f73e7a3 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -86,14 +86,6 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base,
return (image_addr & ~(SZ_1G - 1UL)) + (1UL << (VA_BITS_MIN - 1));
}
-#define efi_bs_call(func, ...) efi_system_table()->boottime->func(__VA_ARGS__)
-#define efi_rt_call(func, ...) efi_system_table()->runtime->func(__VA_ARGS__)
-#define efi_is_native() (true)
-
-#define efi_table_attr(inst, attr) (inst->attr)
-
-#define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__)
-
#define alloc_screen_info(x...) &screen_info
static inline void free_screen_info(struct screen_info *si)
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index b618017205a3..4f00d50585a4 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -114,7 +114,11 @@
#ifndef __ASSEMBLY__
+#include <uapi/linux/elf.h>
#include <linux/bug.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/types.h>
#include <asm/processor.h> /* for signal_minsigstksz, used by ARCH_DLINFO */
typedef unsigned long elf_greg_t;
@@ -224,6 +228,52 @@ extern int aarch32_setup_additional_pages(struct linux_binprm *bprm,
#endif /* CONFIG_COMPAT */
+struct arch_elf_state {
+ int flags;
+};
+
+#define ARM64_ELF_BTI (1 << 0)
+
+#define INIT_ARCH_ELF_STATE { \
+ .flags = 0, \
+}
+
+static inline int arch_parse_elf_property(u32 type, const void *data,
+ size_t datasz, bool compat,
+ struct arch_elf_state *arch)
+{
+ /* No known properties for AArch32 yet */
+ if (IS_ENABLED(CONFIG_COMPAT) && compat)
+ return 0;
+
+ if (type == GNU_PROPERTY_AARCH64_FEATURE_1_AND) {
+ const u32 *p = data;
+
+ if (datasz != sizeof(*p))
+ return -ENOEXEC;
+
+ if (system_supports_bti() &&
+ (*p & GNU_PROPERTY_AARCH64_FEATURE_1_BTI))
+ arch->flags |= ARM64_ELF_BTI;
+ }
+
+ return 0;
+}
+
+static inline int arch_elf_pt_proc(void *ehdr, void *phdr,
+ struct file *f, bool is_interp,
+ struct arch_elf_state *state)
+{
+ return 0;
+}
+
+static inline int arch_check_elf(void *ehdr, bool has_interp,
+ void *interp_ehdr,
+ struct arch_elf_state *state)
+{
+ return 0;
+}
+
#endif /* !__ASSEMBLY__ */
#endif
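
arch_parse_elf_property() above is called once per property found in a PT_GNU_PROPERTY note; for GNU_PROPERTY_AARCH64_FEATURE_1_AND the payload must be exactly one 32-bit bitmask, and bit 0 (BTI) latches ARM64_ELF_BTI when the CPU supports it. A hedged standalone sketch of that payload check, with locally defined constants standing in for the uapi ones:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define FEATURE_1_BTI (1u << 0)         /* GNU_PROPERTY_AARCH64_FEATURE_1_BTI */
#define FEATURE_1_PAC (1u << 1)         /* GNU_PROPERTY_AARCH64_FEATURE_1_PAC */

/* Returns a BTI flag in the spirit of arch->flags, or -1 for a
 * malformed property (the kernel returns -ENOEXEC there). */
static int parse_feature_1_and(const void *data, size_t datasz,
                               int bti_supported)
{
        uint32_t word;

        if (datasz != sizeof(word))
                return -1;

        memcpy(&word, data, sizeof(word));      /* no alignment assumptions */

        if (bti_supported && (word & FEATURE_1_BTI))
                return 1;                       /* corresponds to ARM64_ELF_BTI */

        return 0;
}
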
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index 6a395a7e6707..035003acfa87 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -22,7 +22,7 @@
#define ESR_ELx_EC_PAC (0x09) /* EL2 and above */
/* Unallocated EC: 0x0A - 0x0B */
#define ESR_ELx_EC_CP14_64 (0x0C)
-/* Unallocated EC: 0x0d */
+#define ESR_ELx_EC_BTI (0x0D)
#define ESR_ELx_EC_ILL (0x0E)
/* Unallocated EC: 0x0F - 0x10 */
#define ESR_ELx_EC_SVC32 (0x11)
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..7577a754d443 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -34,6 +34,7 @@ static inline u32 disr_to_esr(u64 disr)
asmlinkage void enter_from_user_mode(void);
void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs);
void do_undefinstr(struct pt_regs *regs);
+void do_bti(struct pt_regs *regs);
asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr);
void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
struct pt_regs *regs);
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index 87ad961f3c97..985493af704b 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -32,30 +32,70 @@ u64 smp_irq_stat_cpu(unsigned int cpu);
struct nmi_ctx {
u64 hcr;
+ unsigned int cnt;
};
DECLARE_PER_CPU(struct nmi_ctx, nmi_contexts);
-#define arch_nmi_enter() \
- do { \
- if (is_kernel_in_hyp_mode()) { \
- struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts); \
- nmi_ctx->hcr = read_sysreg(hcr_el2); \
- if (!(nmi_ctx->hcr & HCR_TGE)) { \
- write_sysreg(nmi_ctx->hcr | HCR_TGE, hcr_el2); \
- isb(); \
- } \
- } \
- } while (0)
+#define arch_nmi_enter() \
+do { \
+ struct nmi_ctx *___ctx; \
+ u64 ___hcr; \
+ \
+ if (!is_kernel_in_hyp_mode()) \
+ break; \
+ \
+ ___ctx = this_cpu_ptr(&nmi_contexts); \
+ if (___ctx->cnt) { \
+ ___ctx->cnt++; \
+ break; \
+ } \
+ \
+ ___hcr = read_sysreg(hcr_el2); \
+ if (!(___hcr & HCR_TGE)) { \
+ write_sysreg(___hcr | HCR_TGE, hcr_el2); \
+ isb(); \
+ } \
+ /* \
+ * Make sure the sysreg write is performed before ___ctx->cnt \
+ * is set to 1. NMIs that see cnt == 1 will rely on us. \
+ */ \
+ barrier(); \
+ ___ctx->cnt = 1; \
+ /* \
+ * Make sure ___ctx->cnt is set before we save ___hcr. We \
+ * don't want ___ctx->hcr to be overwritten. \
+ */ \
+ barrier(); \
+ ___ctx->hcr = ___hcr; \
+} while (0)
-#define arch_nmi_exit() \
- do { \
- if (is_kernel_in_hyp_mode()) { \
- struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts); \
- if (!(nmi_ctx->hcr & HCR_TGE)) \
- write_sysreg(nmi_ctx->hcr, hcr_el2); \
- } \
- } while (0)
+#define arch_nmi_exit() \
+do { \
+ struct nmi_ctx *___ctx; \
+ u64 ___hcr; \
+ \
+ if (!is_kernel_in_hyp_mode()) \
+ break; \
+ \
+ ___ctx = this_cpu_ptr(&nmi_contexts); \
+ ___hcr = ___ctx->hcr; \
+ /* \
+ * Make sure we read ___ctx->hcr before we release \
+ * ___ctx->cnt as it makes ___ctx->hcr updatable again. \
+ */ \
+ barrier(); \
+ ___ctx->cnt--; \
+ /* \
+ * Make sure ___ctx->cnt release is visible before we \
+ * restore the sysreg. Otherwise a new NMI occurring \
+ * right after write_sysreg() can be fooled and think \
+ * we secured things for it. \
+ */ \
+ barrier(); \
+ if (!___ctx->cnt && !(___hcr & HCR_TGE)) \
+ write_sysreg(___hcr, hcr_el2); \
+} while (0)
static inline void ack_bad_irq(unsigned int irq)
{
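
The reworked arch_nmi_enter()/arch_nmi_exit() above make the HCR_EL2 save/restore nesting-safe: only the outermost entry saves the register and forces TGE, a per-CPU count tracks the depth, and compiler barriers order the count updates against the saved value so a nested NMI never overwrites or prematurely consumes it. A condensed single-context sketch of the scheme, using hypothetical read_reg()/write_reg() accessors and a plain compiler barrier:

#define TRAP_BIT 1UL                    /* stands in for HCR_TGE */
#define barrier() __asm__ __volatile__("" ::: "memory")

unsigned long read_reg(void);           /* hypothetical sysreg read */
void write_reg(unsigned long v);        /* hypothetical sysreg write */

static struct {
        unsigned long saved;
        unsigned int cnt;
} ctx;

static void nmi_enter_sketch(void)
{
        unsigned long cur;

        if (ctx.cnt) {                  /* nested: outer level owns 'saved' */
                ctx.cnt++;
                return;
        }

        cur = read_reg();
        if (!(cur & TRAP_BIT))
                write_reg(cur | TRAP_BIT);
        barrier();                      /* register write before cnt = 1 */
        ctx.cnt = 1;
        barrier();                      /* cnt visible before 'saved' is written */
        ctx.saved = cur;
}

static void nmi_exit_sketch(void)
{
        unsigned long saved = ctx.saved;

        barrier();                      /* read 'saved' before releasing cnt */
        ctx.cnt--;
        barrier();                      /* drop cnt before the restore below */
        if (!ctx.cnt && !(saved & TRAP_BIT))
                write_reg(saved);       /* outermost exit restores the register */
}
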
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 2eb6c234d594..94ba0c5bced2 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -17,22 +17,11 @@
extern bool arch_hugetlb_migration_supported(struct hstate *h);
#endif
-#define __HAVE_ARCH_HUGE_PTEP_GET
-static inline pte_t huge_ptep_get(pte_t *ptep)
-{
- return READ_ONCE(*ptep);
-}
-
-static inline int is_hugepage_only_range(struct mm_struct *mm,
- unsigned long addr, unsigned long len)
-{
- return 0;
-}
-
static inline void arch_clear_hugepage_flags(struct page *page)
{
clear_bit(PG_dcache_clean, &page->flags);
}
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
struct page *page, int writable);
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 0f00265248b5..d683bcbf1e7c 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -94,6 +94,7 @@
#define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16)
#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH)
#define KERNEL_HWCAP_RNG __khwcap2_feature(RNG)
+#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI)
/*
* This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index bb313dde58a4..0bc46149e491 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -39,13 +39,37 @@ enum aarch64_insn_encoding_class {
* system instructions */
};
-enum aarch64_insn_hint_op {
+enum aarch64_insn_hint_cr_op {
AARCH64_INSN_HINT_NOP = 0x0 << 5,
AARCH64_INSN_HINT_YIELD = 0x1 << 5,
AARCH64_INSN_HINT_WFE = 0x2 << 5,
AARCH64_INSN_HINT_WFI = 0x3 << 5,
AARCH64_INSN_HINT_SEV = 0x4 << 5,
AARCH64_INSN_HINT_SEVL = 0x5 << 5,
+
+ AARCH64_INSN_HINT_XPACLRI = 0x07 << 5,
+ AARCH64_INSN_HINT_PACIA_1716 = 0x08 << 5,
+ AARCH64_INSN_HINT_PACIB_1716 = 0x0A << 5,
+ AARCH64_INSN_HINT_AUTIA_1716 = 0x0C << 5,
+ AARCH64_INSN_HINT_AUTIB_1716 = 0x0E << 5,
+ AARCH64_INSN_HINT_PACIAZ = 0x18 << 5,
+ AARCH64_INSN_HINT_PACIASP = 0x19 << 5,
+ AARCH64_INSN_HINT_PACIBZ = 0x1A << 5,
+ AARCH64_INSN_HINT_PACIBSP = 0x1B << 5,
+ AARCH64_INSN_HINT_AUTIAZ = 0x1C << 5,
+ AARCH64_INSN_HINT_AUTIASP = 0x1D << 5,
+ AARCH64_INSN_HINT_AUTIBZ = 0x1E << 5,
+ AARCH64_INSN_HINT_AUTIBSP = 0x1F << 5,
+
+ AARCH64_INSN_HINT_ESB = 0x10 << 5,
+ AARCH64_INSN_HINT_PSB = 0x11 << 5,
+ AARCH64_INSN_HINT_TSB = 0x12 << 5,
+ AARCH64_INSN_HINT_CSDB = 0x14 << 5,
+
+ AARCH64_INSN_HINT_BTI = 0x20 << 5,
+ AARCH64_INSN_HINT_BTIC = 0x22 << 5,
+ AARCH64_INSN_HINT_BTIJ = 0x24 << 5,
+ AARCH64_INSN_HINT_BTIJC = 0x26 << 5,
};
enum aarch64_insn_imm_type {
@@ -344,7 +368,7 @@ __AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000)
#undef __AARCH64_INSN_FUNCS
-bool aarch64_insn_is_nop(u32 insn);
+bool aarch64_insn_is_steppable_hint(u32 insn);
bool aarch64_insn_is_branch_imm(u32 insn);
static inline bool aarch64_insn_is_adr_adrp(u32 insn)
@@ -370,7 +394,7 @@ u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr,
enum aarch64_insn_branch_type type);
u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr,
enum aarch64_insn_condition cond);
-u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_op op);
+u32 aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op);
u32 aarch64_insn_gen_nop(void);
u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg,
enum aarch64_insn_branch_type type);
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 7c7eeeaab9fa..0c9b5fc4ba0a 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -64,12 +64,14 @@ extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu);
-extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high);
+extern void __kvm_timer_set_cntvoff(u64 cntvoff);
extern int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu);
extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu);
+extern void __kvm_enable_ssbs(void);
+
extern u64 __vgic_v3_get_ich_vtr_el2(void);
extern u64 __vgic_v3_read_vmcr(void);
extern void __vgic_v3_write_vmcr(u32 vmcr);
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index a30b4eec7cb4..6ea53e6e8b26 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -507,10 +507,12 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
static __always_inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
{
- if (vcpu_mode_is_32bit(vcpu))
+ if (vcpu_mode_is_32bit(vcpu)) {
kvm_skip_instr32(vcpu, is_wide_instr);
- else
+ } else {
*vcpu_pc(vcpu) += 4;
+ *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK;
+ }
/* advance the singlestep state machine */
*vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 32c8a675e5a4..abbdf9703e20 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -46,6 +46,9 @@
#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3)
#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4)
+#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
+ KVM_DIRTY_LOG_INITIALLY_SET)
+
DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
extern unsigned int kvm_sve_max_vl;
@@ -112,12 +115,8 @@ struct kvm_vcpu_fault_info {
u64 disr_el1; /* Deferred [SError] Status Register */
};
-/*
- * 0 is reserved as an invalid value.
- * Order should be kept in sync with the save/restore code.
- */
enum vcpu_sysreg {
- __INVALID_SYSREG__,
+ __INVALID_SYSREG__, /* 0 is reserved as an invalid value */
MPIDR_EL1, /* MultiProcessor Affinity Register */
CSSELR_EL1, /* Cache Size Selection Register */
SCTLR_EL1, /* System Control Register */
@@ -415,6 +414,8 @@ struct kvm_vm_stat {
struct kvm_vcpu_stat {
u64 halt_successful_poll;
u64 halt_attempted_poll;
+ u64 halt_poll_success_ns;
+ u64 halt_poll_fail_ns;
u64 halt_poll_invalid;
u64 halt_wakeup;
u64 hvc_exit_stat;
@@ -530,39 +531,6 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
cpu_ctxt->sys_regs[MPIDR_EL1] = read_cpuid_mpidr();
}
-void __kvm_enable_ssbs(void);
-
-static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
- unsigned long hyp_stack_ptr,
- unsigned long vector_ptr)
-{
- /*
- * Calculate the raw per-cpu offset without a translation from the
- * kernel's mapping to the linear mapping, and store it in tpidr_el2
- * so that we can use adr_l to access per-cpu variables in EL2.
- */
- u64 tpidr_el2 = ((u64)this_cpu_ptr(&kvm_host_data) -
- (u64)kvm_ksym_ref(kvm_host_data));
-
- /*
- * Call initialization code, and switch to the full blown HYP code.
- * If the cpucaps haven't been finalized yet, something has gone very
- * wrong, and hyp will crash and burn when it uses any
- * cpus_have_const_cap() wrapper.
- */
- BUG_ON(!system_capabilities_finalized());
- __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
-
- /*
- * Disabling SSBD on a non-VHE system requires us to enable SSBS
- * at EL2.
- */
- if (!has_vhe() && this_cpu_has_cap(ARM64_SSBS) &&
- arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
- kvm_call_hyp(__kvm_enable_ssbs);
- }
-}
-
static inline bool kvm_arch_requires_vhe(void)
{
/*
@@ -573,10 +541,6 @@ static inline bool kvm_arch_requires_vhe(void)
if (system_supports_sve())
return true;
- /* Some implementations have defects that confine them to VHE */
- if (cpus_have_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE))
- return true;
-
return false;
}
@@ -598,8 +562,6 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
-static inline void __cpu_init_stage2(void) {}
-
/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
@@ -670,7 +632,7 @@ static inline int kvm_arm_have_ssbd(void)
void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
-void kvm_set_ipa_limit(void);
+int kvm_set_ipa_limit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index fe57f60f06a8..ce3080834bfa 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -10,10 +10,9 @@
#include <linux/compiler.h>
#include <linux/kvm_host.h>
#include <asm/alternative.h>
-#include <asm/kvm_mmu.h>
#include <asm/sysreg.h>
-#define __hyp_text __section(.hyp.text) notrace
+#define __hyp_text __section(.hyp.text) notrace __noscs
#define read_sysreg_elx(r,nvh,vh) \
({ \
@@ -56,12 +55,12 @@
int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu);
-void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
-void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
-void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu);
-void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu);
-void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu);
-void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu);
+void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if);
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
void __timer_enable_traps(struct kvm_vcpu *vcpu);
@@ -88,22 +87,5 @@ void deactivate_traps_vhe_put(void);
u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
void __noreturn __hyp_do_panic(unsigned long, ...);
-/*
- * Must be called from hyp code running at EL2 with an updated VTTBR
- * and interrupts disabled.
- */
-static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm)
-{
- write_sysreg(kvm->arch.vtcr, vtcr_el2);
- write_sysreg(kvm_get_vttbr(kvm), vttbr_el2);
-
- /*
- * ARM errata 1165522 and 1530923 require the actual execution of the
- * above before we can switch to the EL1/EL0 translation regime used by
- * the guest.
- */
- asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT_VHE));
-}
-
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 30b0e8d6b895..f1a74163d764 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -172,8 +172,8 @@ void kvm_clear_hyp_idmap(void);
__pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
#define kvm_mk_pud(pmdp) \
__pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
-#define kvm_mk_pgd(pudp) \
- __pgd(__phys_to_pgd_val(__pa(pudp)) | PUD_TYPE_TABLE)
+#define kvm_mk_p4d(pmdp) \
+ __p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)
#define kvm_set_pud(pudp, pud) set_pud(pudp, pud)
@@ -299,6 +299,12 @@ static inline bool kvm_s2pud_young(pud_t pud)
#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
#endif
+#ifdef __PAGETABLE_P4D_FOLDED
+#define hyp_p4d_table_empty(p4dp) (0)
+#else
+#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
+#endif
+
struct kvm;
#define kvm_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l))
@@ -363,8 +369,6 @@ static inline void __kvm_flush_dcache_pud(pud_t pud)
}
}
-#define kvm_virt_to_phys(x) __pa_symbol(x)
-
void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
@@ -416,7 +420,7 @@ static inline unsigned int kvm_get_vmid_bits(void)
{
int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
- return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
+ return get_vmid_bits(reg);
}
/*
@@ -473,7 +477,7 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
extern void *__kvm_bp_vect_base;
extern int __kvm_harden_el2_vector_slot;
-/* This is only called on a VHE system */
+/* This is called on both VHE and !VHE systems */
static inline void *kvm_get_hyp_vector(void)
{
struct bp_hardening_data *data = arm64_get_bp_hardening_data();
@@ -604,5 +608,22 @@ static __always_inline u64 kvm_get_vttbr(struct kvm *kvm)
return kvm_phys_to_vttbr(baddr) | vmid_field | cnp;
}
+/*
+ * Must be called from hyp code running at EL2 with an updated VTTBR
+ * and interrupts disabled.
+ */
+static __always_inline void __load_guest_stage2(struct kvm *kvm)
+{
+ write_sysreg(kvm->arch.vtcr, vtcr_el2);
+ write_sysreg(kvm_get_vttbr(kvm), vttbr_el2);
+
+ /*
+ * ARM errata 1165522 and 1530923 require the actual execution of the
+ * above before we can switch to the EL1/EL0 translation regime used by
+ * the guest.
+ */
+ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
+}
+
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index ebee3113a62f..81fefd2a1d02 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -4,6 +4,52 @@
#define __ALIGN .align 2
#define __ALIGN_STR ".align 2"
+#if defined(CONFIG_ARM64_BTI_KERNEL) && defined(__aarch64__)
+
+/*
+ * Since current versions of gas reject the BTI instruction unless we
+ * set the architecture version to v8.5 we use the hint instruction
+ * instead.
+ */
+#define BTI_C hint 34 ;
+#define BTI_J hint 36 ;
+
+/*
+ * When using in-kernel BTI we need to ensure that PCS-conformant assembly
+ * functions have suitable annotations. Override SYM_FUNC_START to insert
+ * a BTI landing pad at the start of everything.
+ */
+#define SYM_FUNC_START(name) \
+ SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+ BTI_C
+
+#define SYM_FUNC_START_NOALIGN(name) \
+ SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE) \
+ BTI_C
+
+#define SYM_FUNC_START_LOCAL(name) \
+ SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
+ BTI_C
+
+#define SYM_FUNC_START_LOCAL_NOALIGN(name) \
+ SYM_START(name, SYM_L_LOCAL, SYM_A_NONE) \
+ BTI_C
+
+#define SYM_FUNC_START_WEAK(name) \
+ SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
+ BTI_C
+
+#define SYM_FUNC_START_WEAK_NOALIGN(name) \
+ SYM_START(name, SYM_L_WEAK, SYM_A_NONE) \
+ BTI_C
+
+#define SYM_INNER_LABEL(name, linkage) \
+ .type name SYM_T_NONE ASM_NL \
+ SYM_ENTRY(name, linkage, SYM_A_NONE) \
+ BTI_J
+
+#endif
+
/*
* Annotate a function as position independent, i.e., safe to be called before
* the kernel virtual mapping is activated.
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
new file mode 100644
index 000000000000..081ec8de9ea6
--- /dev/null
+++ b/arch/arm64/include/asm/mman.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_MMAN_H__
+#define __ASM_MMAN_H__
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <uapi/asm/mman.h>
+
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+ unsigned long pkey __always_unused)
+{
+ if (system_supports_bti() && (prot & PROT_BTI))
+ return VM_ARM64_BTI;
+
+ return 0;
+}
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+
+static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
+{
+ return (vm_flags & VM_ARM64_BTI) ? __pgprot(PTE_GP) : __pgprot(0);
+}
+#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
+
+static inline bool arch_validate_prot(unsigned long prot,
+ unsigned long addr __always_unused)
+{
+ unsigned long supported = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM;
+
+ if (system_supports_bti())
+ supported |= PROT_BTI;
+
+ return (prot & ~supported) == 0;
+}
+#define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr)
+
+#endif /* ! __ASM_MMAN_H__ */
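
The new asm/mman.h wires PROT_BTI into the generic mmap/mprotect paths: arch_validate_prot() only accepts it when the CPU has BTI, and arch_calc_vm_prot_bits()/arch_vm_get_page_prot() translate it into VM_ARM64_BTI and the PTE_GP bit. A hedged userspace sketch of how a loader-style consumer would request a guarded executable mapping; PROT_BTI is assumed to be 0x10, its arm64 uapi value, when the libc headers do not provide it:

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef PROT_BTI
#define PROT_BTI 0x10           /* arm64 uapi value, assumed if headers lack it */
#endif

int main(void)
{
        size_t len = 4096;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        /* ...copy BTI-compiled code into p... */

        if (mprotect(p, len, PROT_READ | PROT_EXEC | PROT_BTI) != 0)
                perror("mprotect(PROT_BTI)");   /* rejected on CPUs without BTI */

        munmap(p, len);
        return 0;
}
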
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 172d76fa0245..58e93583ddb6 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -73,17 +73,17 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pudp)
free_page((unsigned long)pudp);
}
-static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot)
+static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
{
- set_pgd(pgdp, __pgd(__phys_to_pgd_val(pudp) | prot));
+ set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot));
}
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
{
- __pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE);
+ __p4d_populate(p4dp, __pa(pudp), PUD_TYPE_TABLE);
}
#else
-static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot)
+static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
{
BUILD_BUG();
}
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 6bf5e650da78..9c91a8f93a0e 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -151,6 +151,7 @@
#define PTE_SHARED (_AT(pteval_t, 3) << 8) /* SH[1:0], inner shareable */
#define PTE_AF (_AT(pteval_t, 1) << 10) /* Access Flag */
#define PTE_NG (_AT(pteval_t, 1) << 11) /* nG */
+#define PTE_GP (_AT(pteval_t, 1) << 50) /* BTI guarded */
#define PTE_DBM (_AT(pteval_t, 1) << 51) /* Dirty Bit Management */
#define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */
#define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */
@@ -190,7 +191,6 @@
* Memory Attribute override for Stage-2 (MemAttr[3:0])
*/
#define PTE_S2_MEMATTR(t) (_AT(pteval_t, (t)) << 2)
-#define PTE_S2_MEMATTR_MASK (_AT(pteval_t, 0xf) << 2)
/*
* EL2/HYP PTE/PMD definitions
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 1305e28225fc..2e7e0f452301 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -21,6 +21,7 @@
#ifndef __ASSEMBLY__
+#include <asm/cpufeature.h>
#include <asm/pgtable-types.h>
extern bool arm64_use_ng_mappings;
@@ -31,6 +32,16 @@ extern bool arm64_use_ng_mappings;
#define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0)
#define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0)
+/*
+ * If we have userspace only BTI we don't want to mark kernel pages
+ * guarded even if the system does support BTI.
+ */
+#ifdef CONFIG_ARM64_BTI_KERNEL
+#define PTE_MAYBE_GP (system_supports_bti() ? PTE_GP : 0)
+#else
+#define PTE_MAYBE_GP 0
+#endif
+
#define PROT_DEFAULT (_PROT_DEFAULT | PTE_MAYBE_NG)
#define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_MAYBE_NG)
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index acb0751a6606..b8f158ae2527 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -14,6 +14,7 @@
typedef u64 pteval_t;
typedef u64 pmdval_t;
typedef u64 pudval_t;
+typedef u64 p4dval_t;
typedef u64 pgdval_t;
/*
@@ -44,13 +45,11 @@ typedef struct { pteval_t pgprot; } pgprot_t;
#define __pgprot(x) ((pgprot_t) { (x) } )
#if CONFIG_PGTABLE_LEVELS == 2
-#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopmd.h>
#elif CONFIG_PGTABLE_LEVELS == 3
-#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopud.h>
#elif CONFIG_PGTABLE_LEVELS == 4
-#include <asm-generic/5level-fixup.h>
+#include <asm-generic/pgtable-nop4d.h>
#endif
#endif /* __ASM_PGTABLE_TYPES_H */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 538c85e62f86..1f3218fc52fc 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -298,6 +298,11 @@ static inline pte_t pgd_pte(pgd_t pgd)
return __pte(pgd_val(pgd));
}
+static inline pte_t p4d_pte(p4d_t p4d)
+{
+ return __pte(p4d_val(p4d));
+}
+
static inline pte_t pud_pte(pud_t pud)
{
return __pte(pud_val(pud));
@@ -366,7 +371,7 @@ static inline int pmd_protnone(pmd_t pmd)
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
+#define pmd_mkinvalid(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID))
#define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
@@ -401,12 +406,18 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
+#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d))
+#define __phys_to_p4d_val(phys) __phys_to_pte_val(phys)
+
#define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd))
#define __phys_to_pgd_val(phys) __phys_to_pte_val(phys)
#define __pgprot_modify(prot,mask,bits) \
__pgprot((pgprot_val(prot) & ~(mask)) | (bits))
+#define pgprot_nx(prot) \
+ __pgprot_modify(prot, 0, PTE_PXN)
+
/*
* Mark the prot value as uncacheable and unbufferable.
*/
@@ -457,6 +468,7 @@ extern pgd_t init_pg_dir[PTRS_PER_PGD];
extern pgd_t init_pg_end[];
extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
extern pgd_t idmap_pg_dir[PTRS_PER_PGD];
+extern pgd_t idmap_pg_end[];
extern pgd_t tramp_pg_dir[PTRS_PER_PGD];
extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd);
@@ -508,7 +520,7 @@ static inline void pte_unmap(pte_t *pte) { }
#define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr))
#define pte_clear_fixmap() clear_fixmap(FIX_PTE)
-#define pmd_page(pmd) pfn_to_page(__phys_to_pfn(__pmd_to_phys(pmd)))
+#define pmd_page(pmd) phys_to_page(__pmd_to_phys(pmd))
/* use ONLY for statically allocated translation tables */
#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr))))
@@ -566,7 +578,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
#define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr))
#define pmd_clear_fixmap() clear_fixmap(FIX_PMD)
-#define pud_page(pud) pfn_to_page(__phys_to_pfn(__pud_to_phys(pud)))
+#define pud_page(pud) phys_to_page(__pud_to_phys(pud))
/* use ONLY for statically allocated translation tables */
#define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr))))
@@ -588,49 +600,50 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud))
-#define pgd_none(pgd) (!pgd_val(pgd))
-#define pgd_bad(pgd) (!(pgd_val(pgd) & 2))
-#define pgd_present(pgd) (pgd_val(pgd))
+#define p4d_none(p4d) (!p4d_val(p4d))
+#define p4d_bad(p4d) (!(p4d_val(p4d) & 2))
+#define p4d_present(p4d) (p4d_val(p4d))
-static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
- if (in_swapper_pgdir(pgdp)) {
- set_swapper_pgd(pgdp, pgd);
+ if (in_swapper_pgdir(p4dp)) {
+ set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d)));
return;
}
- WRITE_ONCE(*pgdp, pgd);
+ WRITE_ONCE(*p4dp, p4d);
dsb(ishst);
isb();
}
-static inline void pgd_clear(pgd_t *pgdp)
+static inline void p4d_clear(p4d_t *p4dp)
{
- set_pgd(pgdp, __pgd(0));
+ set_p4d(p4dp, __p4d(0));
}
-static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
+static inline phys_addr_t p4d_page_paddr(p4d_t p4d)
{
- return __pgd_to_phys(pgd);
+ return __p4d_to_phys(p4d);
}
/* Find an entry in the frst-level page table. */
#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
-#define pud_offset_phys(dir, addr) (pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t))
+#define pud_offset_phys(dir, addr) (p4d_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t))
#define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr))))
#define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr))
-#define pud_set_fixmap_offset(pgd, addr) pud_set_fixmap(pud_offset_phys(pgd, addr))
+#define pud_set_fixmap_offset(p4d, addr) pud_set_fixmap(pud_offset_phys(p4d, addr))
#define pud_clear_fixmap() clear_fixmap(FIX_PUD)
-#define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd)))
+#define p4d_page(p4d) pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d)))
/* use ONLY for statically allocated translation tables */
#define pud_offset_kimg(dir,addr) ((pud_t *)__phys_to_kimg(pud_offset_phys((dir), (addr))))
#else
+#define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;})
#define pgd_page_paddr(pgd) ({ BUILD_BUG(); 0;})
/* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
@@ -660,7 +673,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
- PTE_PROT_NONE | PTE_VALID | PTE_WRITE;
+ PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP;
/* preserve the hardware dirty information */
if (pte_hw_dirty(pte))
pte = pte_mkdirty(pte);
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index bf57308fcd63..953b6a1ce549 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -35,6 +35,7 @@
#define GIC_PRIO_PSR_I_SET (1 << 4)
/* Additional SPSR bits not exposed in the UABI */
+#define PSR_MODE_THREAD_BIT (1 << 0)
#define PSR_IL_BIT (1 << 20)
/* AArch32-specific ptrace requests */
diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h
new file mode 100644
index 000000000000..eaa2cd92e4c1
--- /dev/null
+++ b/arch/arm64/include/asm/scs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_SCS_H
+#define _ASM_SCS_H
+
+#ifdef __ASSEMBLY__
+
+#include <asm/asm-offsets.h>
+
+#ifdef CONFIG_SHADOW_CALL_STACK
+ scs_sp .req x18
+
+ .macro scs_load tsk, tmp
+ ldr scs_sp, [\tsk, #TSK_TI_SCS_SP]
+ .endm
+
+ .macro scs_save tsk, tmp
+ str scs_sp, [\tsk, #TSK_TI_SCS_SP]
+ .endm
+#else
+ .macro scs_load tsk, tmp
+ .endm
+
+ .macro scs_save tsk, tmp
+ .endm
+#endif /* CONFIG_SHADOW_CALL_STACK */
+
+#endif /* __ASSEMBLY __ */
+
+#endif /* _ASM_SCS_H */
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 40d5ba029615..ea268d88b6f7 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -23,14 +23,6 @@
#define CPU_STUCK_REASON_52_BIT_VA (UL(1) << CPU_STUCK_REASON_SHIFT)
#define CPU_STUCK_REASON_NO_GRAN (UL(2) << CPU_STUCK_REASON_SHIFT)
-/* Possible options for __cpu_setup */
-/* Option to setup primary cpu */
-#define ARM64_CPU_BOOT_PRIMARY (1)
-/* Option to setup secondary cpus */
-#define ARM64_CPU_BOOT_SECONDARY (2)
-/* Option to setup cpus for different cpu run time services */
-#define ARM64_CPU_RUNTIME (3)
-
#ifndef __ASSEMBLY__
#include <asm/percpu.h>
@@ -96,9 +88,6 @@ asmlinkage void secondary_start_kernel(void);
struct secondary_data {
void *stack;
struct task_struct *task;
-#ifdef CONFIG_ARM64_PTR_AUTH
- struct ptrauth_keys_kernel ptrauth_key;
-#endif
long status;
};
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 4d9b1f48dc39..5017b531a415 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -68,12 +68,10 @@ extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk);
DECLARE_PER_CPU(unsigned long *, irq_stack_ptr);
-static inline bool on_irq_stack(unsigned long sp,
+static inline bool on_stack(unsigned long sp, unsigned long low,
+ unsigned long high, enum stack_type type,
struct stack_info *info)
{
- unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr);
- unsigned long high = low + IRQ_STACK_SIZE;
-
if (!low)
return false;
@@ -83,12 +81,20 @@ static inline bool on_irq_stack(unsigned long sp,
if (info) {
info->low = low;
info->high = high;
- info->type = STACK_TYPE_IRQ;
+ info->type = type;
}
-
return true;
}
+static inline bool on_irq_stack(unsigned long sp,
+ struct stack_info *info)
+{
+ unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr);
+ unsigned long high = low + IRQ_STACK_SIZE;
+
+ return on_stack(sp, low, high, STACK_TYPE_IRQ, info);
+}
+
static inline bool on_task_stack(const struct task_struct *tsk,
unsigned long sp,
struct stack_info *info)
@@ -96,16 +102,7 @@ static inline bool on_task_stack(const struct task_struct *tsk,
unsigned long low = (unsigned long)task_stack_page(tsk);
unsigned long high = low + THREAD_SIZE;
- if (sp < low || sp >= high)
- return false;
-
- if (info) {
- info->low = low;
- info->high = high;
- info->type = STACK_TYPE_TASK;
- }
-
- return true;
+ return on_stack(sp, low, high, STACK_TYPE_TASK, info);
}
#ifdef CONFIG_VMAP_STACK
@@ -117,16 +114,7 @@ static inline bool on_overflow_stack(unsigned long sp,
unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack);
unsigned long high = low + OVERFLOW_STACK_SIZE;
- if (sp < low || sp >= high)
- return false;
-
- if (info) {
- info->low = low;
- info->high = high;
- info->type = STACK_TYPE_OVERFLOW;
- }
-
- return true;
+ return on_stack(sp, low, high, STACK_TYPE_OVERFLOW, info);
}
#else
static inline bool on_overflow_stack(unsigned long sp,
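A minimal sketch of how the generalised on_stack() helper introduced above can back an additional stack range without re-duplicating the bounds check; the wrapper name and range here are purely illustrative and not part of the patch.

static inline bool on_example_stack(unsigned long sp, unsigned long base,
				    unsigned long size,
				    struct stack_info *info)
{
	/* Any new stack type only has to supply its bounds and type. */
	return on_stack(sp, base, base + size, STACK_TYPE_UNKNOWN, info);
}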
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 326aac658b9d..9a364aeae5fb 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -68,41 +68,67 @@ static inline bool kvm_stage2_has_pud(struct kvm *kvm)
#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT)
#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
-static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd)
+#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
+#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
+#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
+#define stage2_pgd_populate(kvm, pgd, p4d) pgd_populate(NULL, pgd, p4d)
+
+static inline p4d_t *stage2_p4d_offset(struct kvm *kvm,
+ pgd_t *pgd, unsigned long address)
+{
+ return p4d_offset(pgd, address);
+}
+
+static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d)
+{
+}
+
+static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp)
+{
+ return false;
+}
+
+static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm,
+ phys_addr_t addr, phys_addr_t end)
+{
+ return end;
+}
+
+static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d)
{
if (kvm_stage2_has_pud(kvm))
- return pgd_none(pgd);
+ return p4d_none(p4d);
else
return 0;
}
-static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp)
+static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp)
{
if (kvm_stage2_has_pud(kvm))
- pgd_clear(pgdp);
+ p4d_clear(p4dp);
}
-static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd)
+static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d)
{
if (kvm_stage2_has_pud(kvm))
- return pgd_present(pgd);
+ return p4d_present(p4d);
else
return 1;
}
-static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud)
+static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud)
{
if (kvm_stage2_has_pud(kvm))
- pgd_populate(NULL, pgd, pud);
+ p4d_populate(NULL, p4d, pud);
}
static inline pud_t *stage2_pud_offset(struct kvm *kvm,
- pgd_t *pgd, unsigned long address)
+ p4d_t *p4d, unsigned long address)
{
if (kvm_stage2_has_pud(kvm))
- return pud_offset(pgd, address);
+ return pud_offset(p4d, address);
else
- return (pud_t *)pgd;
+ return (pud_t *)p4d;
}
static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
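To make the new level explicit, a hypothetical walk using the wrappers above would now pass through the (folded) p4d step before reaching the pud; this helper is a sketch, not part of the patch.

static inline pud_t *stage2_example_walk_to_pud(struct kvm *kvm, pgd_t *pgd,
						unsigned long addr)
{
	/* The p4d step collapses onto the pgd entry where p4d is folded. */
	p4d_t *p4d = stage2_p4d_offset(kvm, pgd, addr);

	return stage2_pud_offset(kvm, p4d, addr);
}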
diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h
index 8939c87c4dce..0cde2f473971 100644
--- a/arch/arm64/include/asm/suspend.h
+++ b/arch/arm64/include/asm/suspend.h
@@ -2,7 +2,7 @@
#ifndef __ASM_SUSPEND_H
#define __ASM_SUSPEND_H
-#define NR_CTX_REGS 12
+#define NR_CTX_REGS 13
#define NR_CALLEE_SAVED_REGS 12
/*
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index c4ac0ac25a00..463175f80341 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -105,6 +105,10 @@
#define SYS_DC_CSW sys_insn(1, 0, 7, 10, 2)
#define SYS_DC_CISW sys_insn(1, 0, 7, 14, 2)
+/*
+ * System registers, organised loosely by encoding but grouped together
+ * where the architected name contains an index. e.g. ID_MMFR<n>_EL1.
+ */
#define SYS_OSDTRRX_EL1 sys_reg(2, 0, 0, 0, 2)
#define SYS_MDCCINT_EL1 sys_reg(2, 0, 0, 2, 0)
#define SYS_MDSCR_EL1 sys_reg(2, 0, 0, 2, 2)
@@ -134,12 +138,16 @@
#define SYS_ID_PFR0_EL1 sys_reg(3, 0, 0, 1, 0)
#define SYS_ID_PFR1_EL1 sys_reg(3, 0, 0, 1, 1)
+#define SYS_ID_PFR2_EL1 sys_reg(3, 0, 0, 3, 4)
#define SYS_ID_DFR0_EL1 sys_reg(3, 0, 0, 1, 2)
+#define SYS_ID_DFR1_EL1 sys_reg(3, 0, 0, 3, 5)
#define SYS_ID_AFR0_EL1 sys_reg(3, 0, 0, 1, 3)
#define SYS_ID_MMFR0_EL1 sys_reg(3, 0, 0, 1, 4)
#define SYS_ID_MMFR1_EL1 sys_reg(3, 0, 0, 1, 5)
#define SYS_ID_MMFR2_EL1 sys_reg(3, 0, 0, 1, 6)
#define SYS_ID_MMFR3_EL1 sys_reg(3, 0, 0, 1, 7)
+#define SYS_ID_MMFR4_EL1 sys_reg(3, 0, 0, 2, 6)
+#define SYS_ID_MMFR5_EL1 sys_reg(3, 0, 0, 3, 6)
#define SYS_ID_ISAR0_EL1 sys_reg(3, 0, 0, 2, 0)
#define SYS_ID_ISAR1_EL1 sys_reg(3, 0, 0, 2, 1)
@@ -147,7 +155,6 @@
#define SYS_ID_ISAR3_EL1 sys_reg(3, 0, 0, 2, 3)
#define SYS_ID_ISAR4_EL1 sys_reg(3, 0, 0, 2, 4)
#define SYS_ID_ISAR5_EL1 sys_reg(3, 0, 0, 2, 5)
-#define SYS_ID_MMFR4_EL1 sys_reg(3, 0, 0, 2, 6)
#define SYS_ID_ISAR6_EL1 sys_reg(3, 0, 0, 2, 7)
#define SYS_MVFR0_EL1 sys_reg(3, 0, 0, 3, 0)
@@ -552,6 +559,8 @@
#endif
/* SCTLR_EL1 specific flags. */
+#define SCTLR_EL1_BT1 (BIT(36))
+#define SCTLR_EL1_BT0 (BIT(35))
#define SCTLR_EL1_UCI (BIT(26))
#define SCTLR_EL1_E0E (BIT(24))
#define SCTLR_EL1_SPAN (BIT(23))
@@ -594,6 +603,7 @@
/* id_aa64isar0 */
#define ID_AA64ISAR0_RNDR_SHIFT 60
+#define ID_AA64ISAR0_TLB_SHIFT 56
#define ID_AA64ISAR0_TS_SHIFT 52
#define ID_AA64ISAR0_FHM_SHIFT 48
#define ID_AA64ISAR0_DP_SHIFT 44
@@ -637,6 +647,8 @@
#define ID_AA64PFR0_CSV2_SHIFT 56
#define ID_AA64PFR0_DIT_SHIFT 48
#define ID_AA64PFR0_AMU_SHIFT 44
+#define ID_AA64PFR0_MPAM_SHIFT 40
+#define ID_AA64PFR0_SEL2_SHIFT 36
#define ID_AA64PFR0_SVE_SHIFT 32
#define ID_AA64PFR0_RAS_SHIFT 28
#define ID_AA64PFR0_GIC_SHIFT 24
@@ -655,15 +667,21 @@
#define ID_AA64PFR0_ASIMD_NI 0xf
#define ID_AA64PFR0_ASIMD_SUPPORTED 0x0
#define ID_AA64PFR0_EL1_64BIT_ONLY 0x1
+#define ID_AA64PFR0_EL1_32BIT_64BIT 0x2
#define ID_AA64PFR0_EL0_64BIT_ONLY 0x1
#define ID_AA64PFR0_EL0_32BIT_64BIT 0x2
/* id_aa64pfr1 */
+#define ID_AA64PFR1_MPAMFRAC_SHIFT 16
+#define ID_AA64PFR1_RASFRAC_SHIFT 12
+#define ID_AA64PFR1_MTE_SHIFT 8
#define ID_AA64PFR1_SSBS_SHIFT 4
+#define ID_AA64PFR1_BT_SHIFT 0
#define ID_AA64PFR1_SSBS_PSTATE_NI 0
#define ID_AA64PFR1_SSBS_PSTATE_ONLY 1
#define ID_AA64PFR1_SSBS_PSTATE_INSNS 2
+#define ID_AA64PFR1_BT_BTI 0x1
/* id_aa64zfr0 */
#define ID_AA64ZFR0_F64MM_SHIFT 56
@@ -688,6 +706,9 @@
#define ID_AA64ZFR0_SVEVER_SVE2 0x1
/* id_aa64mmfr0 */
+#define ID_AA64MMFR0_TGRAN4_2_SHIFT 40
+#define ID_AA64MMFR0_TGRAN64_2_SHIFT 36
+#define ID_AA64MMFR0_TGRAN16_2_SHIFT 32
#define ID_AA64MMFR0_TGRAN4_SHIFT 28
#define ID_AA64MMFR0_TGRAN64_SHIFT 24
#define ID_AA64MMFR0_TGRAN16_SHIFT 20
@@ -752,6 +773,25 @@
#define ID_DFR0_PERFMON_8_1 0x4
+#define ID_ISAR4_SWP_FRAC_SHIFT 28
+#define ID_ISAR4_PSR_M_SHIFT 24
+#define ID_ISAR4_SYNCH_PRIM_FRAC_SHIFT 20
+#define ID_ISAR4_BARRIER_SHIFT 16
+#define ID_ISAR4_SMC_SHIFT 12
+#define ID_ISAR4_WRITEBACK_SHIFT 8
+#define ID_ISAR4_WITHSHIFTS_SHIFT 4
+#define ID_ISAR4_UNPRIV_SHIFT 0
+
+#define ID_DFR1_MTPMU_SHIFT 0
+
+#define ID_ISAR0_DIVIDE_SHIFT 24
+#define ID_ISAR0_DEBUG_SHIFT 20
+#define ID_ISAR0_COPROC_SHIFT 16
+#define ID_ISAR0_CMPBRANCH_SHIFT 12
+#define ID_ISAR0_BITFIELD_SHIFT 8
+#define ID_ISAR0_BITCOUNT_SHIFT 4
+#define ID_ISAR0_SWAP_SHIFT 0
+
#define ID_ISAR5_RDM_SHIFT 24
#define ID_ISAR5_CRC32_SHIFT 16
#define ID_ISAR5_SHA2_SHIFT 12
@@ -767,6 +807,22 @@
#define ID_ISAR6_DP_SHIFT 4
#define ID_ISAR6_JSCVT_SHIFT 0
+#define ID_MMFR4_EVT_SHIFT 28
+#define ID_MMFR4_CCIDX_SHIFT 24
+#define ID_MMFR4_LSM_SHIFT 20
+#define ID_MMFR4_HPDS_SHIFT 16
+#define ID_MMFR4_CNP_SHIFT 12
+#define ID_MMFR4_XNX_SHIFT 8
+#define ID_MMFR4_SPECSEI_SHIFT 0
+
+#define ID_MMFR5_ETS_SHIFT 0
+
+#define ID_PFR0_DIT_SHIFT 24
+#define ID_PFR0_CSV2_SHIFT 16
+
+#define ID_PFR2_SSBS_SHIFT 4
+#define ID_PFR2_CSV3_SHIFT 0
+
#define MVFR0_FPROUND_SHIFT 28
#define MVFR0_FPSHVEC_SHIFT 24
#define MVFR0_FPSQRT_SHIFT 20
@@ -785,17 +841,14 @@
#define MVFR1_FPDNAN_SHIFT 4
#define MVFR1_FPFTZ_SHIFT 0
-
-#define ID_AA64MMFR0_TGRAN4_SHIFT 28
-#define ID_AA64MMFR0_TGRAN64_SHIFT 24
-#define ID_AA64MMFR0_TGRAN16_SHIFT 20
-
-#define ID_AA64MMFR0_TGRAN4_NI 0xf
-#define ID_AA64MMFR0_TGRAN4_SUPPORTED 0x0
-#define ID_AA64MMFR0_TGRAN64_NI 0xf
-#define ID_AA64MMFR0_TGRAN64_SUPPORTED 0x0
-#define ID_AA64MMFR0_TGRAN16_NI 0x0
-#define ID_AA64MMFR0_TGRAN16_SUPPORTED 0x1
+#define ID_PFR1_GIC_SHIFT 28
+#define ID_PFR1_VIRT_FRAC_SHIFT 24
+#define ID_PFR1_SEC_FRAC_SHIFT 20
+#define ID_PFR1_GENTIMER_SHIFT 16
+#define ID_PFR1_VIRTUALIZATION_SHIFT 12
+#define ID_PFR1_MPROGMOD_SHIFT 8
+#define ID_PFR1_SECURITY_SHIFT 4
+#define ID_PFR1_PROGMOD_SHIFT 0
#if defined(CONFIG_ARM64_4K_PAGES)
#define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT
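As a sketch of how the new ID_AA64PFR1 BT field definitions above are meant to be consumed, a feature test might extract the field and compare it against ID_AA64PFR1_BT_BTI; this assumes the usual field-extraction helper from <asm/cpufeature.h> and is not part of the patch.

static inline bool example_pfr1_has_bti(u64 pfr1)
{
	/* BT occupies bits [3:0] of ID_AA64PFR1_EL1; 0x1 means BTI present. */
	return cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_BT_SHIFT) >=
	       ID_AA64PFR1_BT_BTI;
}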
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 512174a8e789..6ea8b6a26ae9 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -41,6 +41,10 @@ struct thread_info {
#endif
} preempt;
};
+#ifdef CONFIG_SHADOW_CALL_STACK
+ void *scs_base;
+ void *scs_sp;
+#endif
};
#define thread_saved_pc(tsk) \
@@ -100,11 +104,20 @@ void arch_release_task_struct(struct task_struct *tsk);
_TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
_TIF_SYSCALL_EMU)
+#ifdef CONFIG_SHADOW_CALL_STACK
+#define INIT_SCS \
+ .scs_base = init_shadow_call_stack, \
+ .scs_sp = init_shadow_call_stack,
+#else
+#define INIT_SCS
+#endif
+
#define INIT_THREAD_INFO(tsk) \
{ \
.flags = _TIF_FOREIGN_FPSTATE, \
.preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
+ INIT_SCS \
}
#endif /* __ASM_THREAD_INFO_H */
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 32fc8061aa76..bc5c7b091152 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -304,7 +304,7 @@ do { \
__p = uaccess_mask_ptr(__p); \
__raw_get_user((x), __p, (err)); \
} else { \
- (x) = 0; (err) = -EFAULT; \
+ (x) = (__force __typeof__(x))0; (err) = -EFAULT; \
} \
} while (0)
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 803039d504de..3b859596840d 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-#define __NR_compat_syscalls 439
+#define __NR_compat_syscalls 440
#endif
#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index c1c61635f89c..6d95d0c8bf2f 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -883,6 +883,8 @@ __SYSCALL(__NR_clone3, sys_clone3)
__SYSCALL(__NR_openat2, sys_openat2)
#define __NR_pidfd_getfd 438
__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
+#define __NR_faccessat2 439
+__SYSCALL(__NR_faccessat2, sys_faccessat2)
/*
* Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 61fd26752adc..5051b388c654 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -85,7 +85,7 @@ static inline bool is_kernel_in_hyp_mode(void)
static __always_inline bool has_vhe(void)
{
- if (cpus_have_const_cap(ARM64_HAS_VIRT_HOST_EXTN))
+ if (cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN))
return true;
return false;
diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h
index 0a12115d9638..0cc6636e3f15 100644
--- a/arch/arm64/include/asm/vmap_stack.h
+++ b/arch/arm64/include/asm/vmap_stack.h
@@ -19,10 +19,8 @@ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node)
{
BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK));
- return __vmalloc_node_range(stack_size, THREAD_ALIGN,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP, PAGE_KERNEL, 0, node,
- __builtin_return_address(0));
+ return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node,
+ __builtin_return_address(0));
}
#endif /* __ASM_VMAP_STACK_H */
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 7752d93bb50f..2d6ba1c2592e 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -73,5 +73,6 @@
#define HWCAP2_BF16 (1 << 14)
#define HWCAP2_DGH (1 << 15)
#define HWCAP2_RNG (1 << 16)
+#define HWCAP2_BTI (1 << 17)
#endif /* _UAPI__ASM_HWCAP_H */
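From userspace, the new hwcap bit would typically be probed via the auxiliary vector; a minimal sketch, with the constant repeated as a fallback in case the installed headers predate this patch.

#include <sys/auxv.h>

#ifndef HWCAP2_BTI
#define HWCAP2_BTI (1 << 17)	/* value from the uapi header above */
#endif

static int cpu_has_bti(void)
{
	/* On arm64, AT_HWCAP2 carries the second hwcap word. */
	return !!(getauxval(AT_HWCAP2) & HWCAP2_BTI);
}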
diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h
new file mode 100644
index 000000000000..6fdd71eb644f
--- /dev/null
+++ b/arch/arm64/include/uapi/asm/mman.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI__ASM_MMAN_H
+#define _UAPI__ASM_MMAN_H
+
+#include <asm-generic/mman.h>
+
+#define PROT_BTI 0x10 /* BTI guarded page */
+
+#endif /* ! _UAPI__ASM_MMAN_H */
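A plausible userspace use of the new PROT_BTI flag is to OR it into the protection bits when re-protecting executable code built with BTI; sketch only, error handling elided, and the fallback define is only for headers that predate this patch.

#include <sys/mman.h>

#ifndef PROT_BTI
#define PROT_BTI 0x10	/* value from the new uapi <asm/mman.h> above */
#endif

/* Sketch: mark an already-mapped, page-aligned code range as BTI-guarded. */
static int make_region_bti_guarded(void *addr, size_t len)
{
	return mprotect(addr, len, PROT_READ | PROT_EXEC | PROT_BTI);
}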
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index d1bb5b69f1ce..42cbe34d95ce 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -46,6 +46,7 @@
#define PSR_I_BIT 0x00000080
#define PSR_A_BIT 0x00000100
#define PSR_D_BIT 0x00000200
+#define PSR_BTYPE_MASK 0x00000c00
#define PSR_SSBS_BIT 0x00001000
#define PSR_PAN_BIT 0x00400000
#define PSR_UAO_BIT 0x00800000
@@ -55,6 +56,8 @@
#define PSR_Z_BIT 0x40000000
#define PSR_N_BIT 0x80000000
+#define PSR_BTYPE_SHIFT 10
+
/*
* Groups of PSR bits
*/
@@ -63,6 +66,12 @@
#define PSR_x 0x0000ff00 /* Extension */
#define PSR_c 0x000000ff /* Control */
+/* Convenience names for the values of PSTATE.BTYPE */
+#define PSR_BTYPE_NONE (0b00 << PSR_BTYPE_SHIFT)
+#define PSR_BTYPE_JC (0b01 << PSR_BTYPE_SHIFT)
+#define PSR_BTYPE_C (0b10 << PSR_BTYPE_SHIFT)
+#define PSR_BTYPE_J (0b11 << PSR_BTYPE_SHIFT)
+
/* syscall emulation path in ptrace */
#define PTRACE_SYSEMU 31
#define PTRACE_SYSEMU_SINGLESTEP 32
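Given the mask and shift added above, decoding the PSTATE.BTYPE field from a saved pstate value is a one-liner; shown here only to make the encoding concrete, the helper name is illustrative.

static inline unsigned long example_pstate_btype(unsigned long pstate)
{
	/* Yields 0..3, i.e. PSR_BTYPE_NONE/JC/C/J shifted down. */
	return (pstate & PSR_BTYPE_MASK) >> PSR_BTYPE_SHIFT;
}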
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 4e5b8ee31442..151f28521f1e 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
obj-$(CONFIG_ARM64_SSBD) += ssbd.o
obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o
+obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-y += vdso/ probes/
obj-$(CONFIG_COMPAT_VDSO) += vdso32/
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index a100483b47c4..46ec402e97ed 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -19,6 +19,7 @@
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
+#include <linux/irq_work.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/smp.h>
@@ -269,6 +270,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr)
int apei_claim_sea(struct pt_regs *regs)
{
int err = -ENOENT;
+ bool return_to_irqs_enabled;
unsigned long current_flags;
if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
@@ -276,6 +278,12 @@ int apei_claim_sea(struct pt_regs *regs)
current_flags = local_daif_save_flags();
+ /* current_flags isn't useful here as daif doesn't tell us about pNMI */
+ return_to_irqs_enabled = !irqs_disabled_flags(arch_local_save_flags());
+
+ if (regs)
+ return_to_irqs_enabled = interrupts_enabled(regs);
+
/*
* SEA can interrupt SError, mask it and describe this as an NMI so
* that APEI defers the handling.
@@ -284,6 +292,23 @@ int apei_claim_sea(struct pt_regs *regs)
nmi_enter();
err = ghes_notify_sea();
nmi_exit();
+
+ /*
+ * APEI NMI-like notifications are deferred to irq_work. Unless
+ * we interrupted irqs-masked code, we can do that now.
+ */
+ if (!err) {
+ if (return_to_irqs_enabled) {
+ local_daif_restore(DAIF_PROCCTX_NOIRQ);
+ __irq_enter();
+ irq_work_run();
+ __irq_exit();
+ } else {
+ pr_warn_ratelimited("APEI work queued but not completed");
+ err = -EINPROGRESS;
+ }
+ }
+
local_daif_restore(current_flags);
return err;
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index c19aa81ddc8c..7364de008bab 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops)
}
static int emulation_proc_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = 0;
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index 9981a0a5a87f..0577e2142284 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -34,6 +34,10 @@ int main(void)
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0));
#endif
+#ifdef CONFIG_SHADOW_CALL_STACK
+ DEFINE(TSK_TI_SCS_BASE, offsetof(struct task_struct, thread_info.scs_base));
+ DEFINE(TSK_TI_SCS_SP, offsetof(struct task_struct, thread_info.scs_sp));
+#endif
DEFINE(TSK_STACK, offsetof(struct task_struct, stack));
#ifdef CONFIG_STACKPROTECTOR
DEFINE(TSK_STACK_CANARY, offsetof(struct task_struct, stack_canary));
@@ -92,11 +96,8 @@ int main(void)
BLANK();
DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack));
DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task));
-#ifdef CONFIG_ARM64_PTR_AUTH
- DEFINE(CPU_BOOT_PTRAUTH_KEY, offsetof(struct secondary_data, ptrauth_key));
-#endif
BLANK();
-#ifdef CONFIG_KVM_ARM_HOST
+#ifdef CONFIG_KVM
DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt));
DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1));
DEFINE(VCPU_WORKAROUND_FLAGS, offsetof(struct kvm_vcpu, arch.workaround_flags));
diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S
index 38087b4c0432..4a18055b2ff9 100644
--- a/arch/arm64/kernel/cpu-reset.S
+++ b/arch/arm64/kernel/cpu-reset.S
@@ -29,7 +29,7 @@
* branch to what would be the reset vector. It must be executed with the
* flat identity mapping.
*/
-ENTRY(__cpu_soft_restart)
+SYM_CODE_START(__cpu_soft_restart)
/* Clear sctlr_el1 flags. */
mrs x12, sctlr_el1
mov_q x13, SCTLR_ELx_FLAGS
@@ -47,6 +47,6 @@ ENTRY(__cpu_soft_restart)
mov x1, x3 // arg1
mov x2, x4 // arg2
br x8
-ENDPROC(__cpu_soft_restart)
+SYM_CODE_END(__cpu_soft_restart)
.popsection
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index df56d2295d16..ad06d6802d2e 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -234,7 +234,7 @@ static int detect_harden_bp_fw(void)
smccc_end = NULL;
break;
-#if IS_ENABLED(CONFIG_KVM_ARM_HOST)
+#if IS_ENABLED(CONFIG_KVM)
case SMCCC_CONDUIT_SMC:
cb = call_smc_arch_workaround_1;
smccc_start = __smccc_workaround_1_smc;
@@ -635,7 +635,7 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry,
return is_midr_in_range(midr, &range) && has_dic;
}
-#if defined(CONFIG_HARDEN_EL2_VECTORS) || defined(CONFIG_ARM64_ERRATUM_1319367)
+#if defined(CONFIG_HARDEN_EL2_VECTORS)
static const struct midr_range ca57_a72[] = {
MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
@@ -757,12 +757,16 @@ static const struct arm64_cpu_capabilities erratum_843419_list[] = {
};
#endif
-#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT_VHE
-static const struct midr_range erratum_speculative_at_vhe_list[] = {
+#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT
+static const struct midr_range erratum_speculative_at_list[] = {
#ifdef CONFIG_ARM64_ERRATUM_1165522
/* Cortex A76 r0p0 to r2p0 */
MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0),
#endif
+#ifdef CONFIG_ARM64_ERRATUM_1319367
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
+#endif
#ifdef CONFIG_ARM64_ERRATUM_1530923
/* Cortex A55 r0p0 to r2p0 */
MIDR_RANGE(MIDR_CORTEX_A55, 0, 0, 2, 0),
@@ -774,7 +778,7 @@ static const struct midr_range erratum_speculative_at_vhe_list[] = {
const struct arm64_cpu_capabilities arm64_errata[] = {
#ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
{
- .desc = "ARM errata 826319, 827319, 824069, 819472",
+ .desc = "ARM errata 826319, 827319, 824069, or 819472",
.capability = ARM64_WORKAROUND_CLEAN_CACHE,
ERRATA_MIDR_RANGE_LIST(workaround_clean_cache),
.cpu_enable = cpu_enable_cache_maint_trap,
@@ -856,7 +860,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
#endif
#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
{
- .desc = "Qualcomm erratum 1009, ARM erratum 1286807",
+ .desc = "Qualcomm erratum 1009, or ARM erratum 1286807",
.capability = ARM64_WORKAROUND_REPEAT_TLBI,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
.matches = cpucap_multi_entry_cap_matches,
@@ -897,11 +901,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
ERRATA_MIDR_RANGE_LIST(erratum_1418040_list),
},
#endif
-#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT_VHE
+#ifdef CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT
{
- .desc = "ARM errata 1165522, 1530923",
- .capability = ARM64_WORKAROUND_SPECULATIVE_AT_VHE,
- ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_vhe_list),
+ .desc = "ARM errata 1165522, 1319367, or 1530923",
+ .capability = ARM64_WORKAROUND_SPECULATIVE_AT,
+ ERRATA_MIDR_RANGE_LIST(erratum_speculative_at_list),
},
#endif
#ifdef CONFIG_ARM64_ERRATUM_1463225
@@ -935,13 +939,6 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
.cpu_enable = cpu_enable_trap_ctr_access,
},
#endif
-#ifdef CONFIG_ARM64_ERRATUM_1319367
- {
- .desc = "ARM erratum 1319367",
- .capability = ARM64_WORKAROUND_SPECULATIVE_AT_NVHE,
- ERRATA_MIDR_RANGE_LIST(ca57_a72),
- },
-#endif
{
}
};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9fac745aa7bb..4ae41670c2e6 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -3,6 +3,61 @@
* Contains CPU feature definitions
*
* Copyright (C) 2015 ARM Ltd.
+ *
+ * A note for the weary kernel hacker: the code here is confusing and hard to
+ * follow! That's partly because it's solving a nasty problem, but also because
+ * there's a little bit of over-abstraction that tends to obscure what's going
+ * on behind a maze of helper functions and macros.
+ *
+ * The basic problem is that hardware folks have started gluing together CPUs
+ * with distinct architectural features; in some cases even creating SoCs where
+ * user-visible instructions are available only on a subset of the available
+ * cores. We try to address this by snapshotting the feature registers of the
+ * boot CPU and comparing these with the feature registers of each secondary
+ * CPU when bringing them up. If there is a mismatch, then we update the
+ * snapshot state to indicate the lowest-common denominator of the feature,
+ * known as the "safe" value. This snapshot state can be queried to view the
+ * "sanitised" value of a feature register.
+ *
+ * The sanitised register values are used to decide which capabilities we
+ * have in the system. These may be in the form of traditional "hwcaps"
+ * advertised to userspace or internal "cpucaps" which are used to configure
+ * things like alternative patching and static keys. While a feature mismatch
+ * may result in a TAINT_CPU_OUT_OF_SPEC kernel taint, a capability mismatch
+ * may prevent a CPU from being onlined at all.
+ *
+ * Some implementation details worth remembering:
+ *
+ * - Mismatched features are *always* sanitised to a "safe" value, which
+ * usually indicates that the feature is not supported.
+ *
+ * - A mismatched feature marked with FTR_STRICT will cause a "SANITY CHECK"
+ * warning when onlining an offending CPU and the kernel will be tainted
+ * with TAINT_CPU_OUT_OF_SPEC.
+ *
+ * - Features marked as FTR_VISIBLE have their sanitised value visible to
+ * userspace. FTR_VISIBLE features in registers that are only visible
+ * to EL0 by trapping *must* have a corresponding HWCAP so that late
+ * onlining of CPUs cannot lead to features disappearing at runtime.
+ *
+ * - A "feature" is typically a 4-bit register field. A "capability" is the
+ * high-level description derived from the sanitised field value.
+ *
+ * - Read the Arm ARM (DDI 0487F.a) section D13.1.3 ("Principles of the ID
+ * scheme for fields in ID registers") to understand when feature fields
+ * may be signed or unsigned (FTR_SIGNED and FTR_UNSIGNED accordingly).
+ *
+ * - KVM exposes its own view of the feature registers to guest operating
+ * systems regardless of FTR_VISIBLE. This is typically driven from the
+ * sanitised register values to allow virtual CPUs to be migrated between
+ * arbitrary physical CPUs, but some features not present on the host are
+ * also advertised and emulated. Look at sys_reg_descs[] for the gory
+ * details.
+ *
+ * - If the arm64_ftr_bits[] for a register has a missing field, then this
+ * field is treated as STRICT RES0, including for read_sanitised_ftr_reg().
+ * This is stronger than FTR_HIDDEN and can be used to hide features from
+ * KVM guests.
*/
#define pr_fmt(fmt) "CPU features: " fmt
@@ -124,6 +179,7 @@ static bool __system_matches_cap(unsigned int n);
*/
static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLB_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0),
@@ -166,22 +222,27 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_DIT_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_AMU_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_MPAM_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SEL2_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0),
S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),
- /* Linux doesn't care about the EL3 */
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY),
ARM64_FTR_END,
};
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
+ ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -209,6 +270,24 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
/*
+ * Page size not being supported at Stage-2 is not fatal. You
+ * just give up KVM if PAGE_SIZE isn't supported there. Go fix
+ * your favourite nesting hypervisor.
+ *
+ * There is a small corner case where the hypervisor explicitly
+ * advertises a given granule size at Stage-2 (value 2) on some
+ * vCPUs, and uses the fallback to Stage-1 (value 0) for other
+ * vCPUs. Although this is not forbidden by the architecture, it
+ * indicates that the hypervisor is being silly (or buggy).
+ *
+ * We make no effort to cope with this and pretend that if these
+ * fields are inconsistent across vCPUs, then it isn't worth
+ * trying to bring KVM up.
+ */
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN4_2_SHIFT, 4, 1),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN64_2_SHIFT, 4, 1),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_TGRAN16_2_SHIFT, 4, 1),
+ /*
* We already refuse to boot CPUs that don't support our configured
* page size, so we can only detect mismatches for a page size other
* than the one we're currently using. Unfortunately, SoCs like this
@@ -247,7 +326,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_UAO_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CNP_SHIFT, 4, 0),
@@ -289,7 +368,7 @@ static const struct arm64_ftr_bits ftr_id_mmfr0[] = {
};
static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = {
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 36, 28, 0),
+ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 36, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0),
@@ -316,6 +395,16 @@ static const struct arm64_ftr_bits ftr_dczid[] = {
ARM64_FTR_END,
};
+static const struct arm64_ftr_bits ftr_id_isar0[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DIVIDE_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_DEBUG_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_COPROC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_CMPBRANCH_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_BITFIELD_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_BITCOUNT_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_SWAP_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
static const struct arm64_ftr_bits ftr_id_isar5[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_RDM_SHIFT, 4, 0),
@@ -328,7 +417,37 @@ static const struct arm64_ftr_bits ftr_id_isar5[] = {
};
static const struct arm64_ftr_bits ftr_id_mmfr4[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EVT_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CCIDX_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_LSM_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_HPDS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CNP_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_XNX_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* ac2 */
+ /*
+ * SpecSEI = 1 indicates that the PE might generate an SError on an
+ * external abort on speculative read. It is safer to assume that an
+ * SError might be generated than that it will not be. Hence it has been
+ * classified as FTR_HIGHER_SAFE.
+ */
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_MMFR4_SPECSEI_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
+static const struct arm64_ftr_bits ftr_id_isar4[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SWP_FRAC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_PSR_M_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SYNCH_PRIM_FRAC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_BARRIER_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_SMC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_WRITEBACK_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_WITHSHIFTS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_UNPRIV_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
+static const struct arm64_ftr_bits ftr_id_mmfr5[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR5_ETS_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -344,6 +463,8 @@ static const struct arm64_ftr_bits ftr_id_isar6[] = {
};
static const struct arm64_ftr_bits ftr_id_pfr0[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_DIT_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_CSV2_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* State3 */
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), /* State2 */
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* State1 */
@@ -351,8 +472,26 @@ static const struct arm64_ftr_bits ftr_id_pfr0[] = {
ARM64_FTR_END,
};
+static const struct arm64_ftr_bits ftr_id_pfr1[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_GIC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_VIRT_FRAC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_SEC_FRAC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_GENTIMER_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_VIRTUALIZATION_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_MPROGMOD_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_SECURITY_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_PROGMOD_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
+static const struct arm64_ftr_bits ftr_id_pfr2[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
static const struct arm64_ftr_bits ftr_id_dfr0[] = {
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0),
+ /* [31:28] TraceFilt */
S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0),
@@ -363,6 +502,11 @@ static const struct arm64_ftr_bits ftr_id_dfr0[] = {
ARM64_FTR_END,
};
+static const struct arm64_ftr_bits ftr_id_dfr1[] = {
+ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR1_MTPMU_SHIFT, 4, 0),
+ ARM64_FTR_END,
+};
+
static const struct arm64_ftr_bits ftr_zcr[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE,
ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_SIZE, 0), /* LEN */
@@ -373,7 +517,7 @@ static const struct arm64_ftr_bits ftr_zcr[] = {
* Common ftr bits for a 32bit register with all hidden, strict
* attributes, with 4bit feature fields and a default safe value of
* 0. Covers the following 32bit registers:
- * id_isar[0-4], id_mmfr[1-3], id_pfr1, mvfr[0-1]
+ * id_isar[1-4], id_mmfr[1-3], id_pfr1, mvfr[0-1]
*/
static const struct arm64_ftr_bits ftr_generic_32bits[] = {
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0),
@@ -411,7 +555,7 @@ static const struct __ftr_reg_entry {
/* Op1 = 0, CRn = 0, CRm = 1 */
ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0),
- ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_id_pfr1),
ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0),
ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0),
ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits),
@@ -419,11 +563,11 @@ static const struct __ftr_reg_entry {
ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits),
/* Op1 = 0, CRn = 0, CRm = 2 */
- ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_id_isar0),
ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits),
ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits),
ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits),
- ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_generic_32bits),
+ ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_id_isar4),
ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5),
ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4),
ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6),
@@ -432,6 +576,9 @@ static const struct __ftr_reg_entry {
ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_generic_32bits),
ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_generic_32bits),
ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2),
+ ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2),
+ ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1),
+ ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5),
/* Op1 = 0, CRn = 0, CRm = 4 */
ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0),
@@ -468,16 +615,16 @@ static int search_cmp_ftr_reg(const void *id, const void *regp)
}
/*
- * get_arm64_ftr_reg - Lookup a feature register entry using its
- * sys_reg() encoding. With the array arm64_ftr_regs sorted in the
- * ascending order of sys_id , we use binary search to find a matching
+ * get_arm64_ftr_reg_nowarn - Looks up a feature register entry using
+ * its sys_reg() encoding. With the array arm64_ftr_regs sorted in the
+ * ascending order of sys_id, we use binary search to find a matching
* entry.
*
* returns - Upon success, matching ftr_reg entry for id.
 * - NULL on failure. It is up to the caller to decide
* the impact of a failure.
*/
-static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id)
+static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id)
{
const struct __ftr_reg_entry *ret;
@@ -491,6 +638,27 @@ static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id)
return NULL;
}
+/*
+ * get_arm64_ftr_reg - Looks up a feature register entry using
+ * its sys_reg() encoding. This calls get_arm64_ftr_reg_nowarn().
+ *
+ * returns - Upon success, matching ftr_reg entry for id.
+ * - NULL on failure but with a WARN_ON().
+ */
+static struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id)
+{
+ struct arm64_ftr_reg *reg;
+
+ reg = get_arm64_ftr_reg_nowarn(sys_id);
+
+ /*
+ * Requesting a search for a non-existent register is an error. Warn
+ * and let the caller handle it.
+ */
+ WARN_ON(!reg);
+ return reg;
+}
+
static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg,
s64 ftr_val)
{
@@ -552,7 +720,8 @@ static void __init init_cpu_ftr_reg(u32 sys_reg, u64 new)
const struct arm64_ftr_bits *ftrp;
struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg);
- BUG_ON(!reg);
+ if (!reg)
+ return;
for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) {
u64 ftr_mask = arm64_ftr_mask(ftrp);
@@ -625,6 +794,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0);
+ init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1);
init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0);
init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1);
init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2);
@@ -636,8 +806,11 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1);
init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2);
init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3);
+ init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4);
+ init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5);
init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0);
init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1);
+ init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2);
init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0);
init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1);
init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2);
@@ -682,7 +855,9 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot)
{
struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id);
- BUG_ON(!regp);
+ if (!regp)
+ return 0;
+
update_cpu_ftr_reg(regp, val);
if ((boot & regp->strict_mask) == (val & regp->strict_mask))
return 0;
@@ -691,6 +866,104 @@ static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot)
return 1;
}
+static void relax_cpu_ftr_reg(u32 sys_id, int field)
+{
+ const struct arm64_ftr_bits *ftrp;
+ struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id);
+
+ if (!regp)
+ return;
+
+ for (ftrp = regp->ftr_bits; ftrp->width; ftrp++) {
+ if (ftrp->shift == field) {
+ regp->strict_mask &= ~arm64_ftr_mask(ftrp);
+ break;
+ }
+ }
+
+ /* Bogus field? */
+ WARN_ON(!ftrp->width);
+}
+
+static int update_32bit_cpu_features(int cpu, struct cpuinfo_arm64 *info,
+ struct cpuinfo_arm64 *boot)
+{
+ int taint = 0;
+ u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+ /*
+ * If we don't have AArch32 at all then skip the checks entirely
+ * as the register values may be UNKNOWN and we're not going to be
+ * using them for anything.
+ */
+ if (!id_aa64pfr0_32bit_el0(pfr0))
+ return taint;
+
+ /*
+ * If we don't have AArch32 at EL1, then relax the strictness of
+ * EL1-dependent register fields to avoid spurious sanity check fails.
+ */
+ if (!id_aa64pfr0_32bit_el1(pfr0)) {
+ relax_cpu_ftr_reg(SYS_ID_ISAR4_EL1, ID_ISAR4_SMC_SHIFT);
+ relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_VIRT_FRAC_SHIFT);
+ relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_SEC_FRAC_SHIFT);
+ relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_VIRTUALIZATION_SHIFT);
+ relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_SECURITY_SHIFT);
+ relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_PROGMOD_SHIFT);
+ }
+
+ taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu,
+ info->reg_id_dfr0, boot->reg_id_dfr0);
+ taint |= check_update_ftr_reg(SYS_ID_DFR1_EL1, cpu,
+ info->reg_id_dfr1, boot->reg_id_dfr1);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu,
+ info->reg_id_isar0, boot->reg_id_isar0);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu,
+ info->reg_id_isar1, boot->reg_id_isar1);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu,
+ info->reg_id_isar2, boot->reg_id_isar2);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu,
+ info->reg_id_isar3, boot->reg_id_isar3);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu,
+ info->reg_id_isar4, boot->reg_id_isar4);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu,
+ info->reg_id_isar5, boot->reg_id_isar5);
+ taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu,
+ info->reg_id_isar6, boot->reg_id_isar6);
+
+ /*
+ * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and
+ * ACTLR formats could differ across CPUs and therefore would have to
+ * be trapped for virtualization anyway.
+ */
+ taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu,
+ info->reg_id_mmfr0, boot->reg_id_mmfr0);
+ taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu,
+ info->reg_id_mmfr1, boot->reg_id_mmfr1);
+ taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu,
+ info->reg_id_mmfr2, boot->reg_id_mmfr2);
+ taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu,
+ info->reg_id_mmfr3, boot->reg_id_mmfr3);
+ taint |= check_update_ftr_reg(SYS_ID_MMFR4_EL1, cpu,
+ info->reg_id_mmfr4, boot->reg_id_mmfr4);
+ taint |= check_update_ftr_reg(SYS_ID_MMFR5_EL1, cpu,
+ info->reg_id_mmfr5, boot->reg_id_mmfr5);
+ taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu,
+ info->reg_id_pfr0, boot->reg_id_pfr0);
+ taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu,
+ info->reg_id_pfr1, boot->reg_id_pfr1);
+ taint |= check_update_ftr_reg(SYS_ID_PFR2_EL1, cpu,
+ info->reg_id_pfr2, boot->reg_id_pfr2);
+ taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu,
+ info->reg_mvfr0, boot->reg_mvfr0);
+ taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu,
+ info->reg_mvfr1, boot->reg_mvfr1);
+ taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu,
+ info->reg_mvfr2, boot->reg_mvfr2);
+
+ return taint;
+}
+
/*
* Update system wide CPU feature registers with the values from a
* non-boot CPU. Also performs SANITY checks to make sure that there
@@ -753,9 +1026,6 @@ void update_cpu_features(int cpu,
taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu,
info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2);
- /*
- * EL3 is not our concern.
- */
taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu,
info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0);
taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu,
@@ -764,55 +1034,6 @@ void update_cpu_features(int cpu,
taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu,
info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0);
- /*
- * If we have AArch32, we care about 32-bit features for compat.
- * If the system doesn't support AArch32, don't update them.
- */
- if (id_aa64pfr0_32bit_el0(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) &&
- id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
-
- taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu,
- info->reg_id_dfr0, boot->reg_id_dfr0);
- taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu,
- info->reg_id_isar0, boot->reg_id_isar0);
- taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu,
- info->reg_id_isar1, boot->reg_id_isar1);
- taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu,
- info->reg_id_isar2, boot->reg_id_isar2);
- taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu,
- info->reg_id_isar3, boot->reg_id_isar3);
- taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu,
- info->reg_id_isar4, boot->reg_id_isar4);
- taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu,
- info->reg_id_isar5, boot->reg_id_isar5);
- taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu,
- info->reg_id_isar6, boot->reg_id_isar6);
-
- /*
- * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and
- * ACTLR formats could differ across CPUs and therefore would have to
- * be trapped for virtualization anyway.
- */
- taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu,
- info->reg_id_mmfr0, boot->reg_id_mmfr0);
- taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu,
- info->reg_id_mmfr1, boot->reg_id_mmfr1);
- taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu,
- info->reg_id_mmfr2, boot->reg_id_mmfr2);
- taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu,
- info->reg_id_mmfr3, boot->reg_id_mmfr3);
- taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu,
- info->reg_id_pfr0, boot->reg_id_pfr0);
- taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu,
- info->reg_id_pfr1, boot->reg_id_pfr1);
- taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu,
- info->reg_mvfr0, boot->reg_mvfr0);
- taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu,
- info->reg_mvfr1, boot->reg_mvfr1);
- taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu,
- info->reg_mvfr2, boot->reg_mvfr2);
- }
-
if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) {
taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu,
info->reg_zcr, boot->reg_zcr);
@@ -824,6 +1045,12 @@ void update_cpu_features(int cpu,
}
/*
+ * This relies on a sanitised view of the AArch64 ID registers
+ * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last.
+ */
+ taint |= update_32bit_cpu_features(cpu, info, boot);
+
+ /*
* Mismatched CPU features are a recipe for disaster. Don't even
* pretend to support them.
*/
@@ -837,8 +1064,8 @@ u64 read_sanitised_ftr_reg(u32 id)
{
struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id);
- /* We shouldn't get a request for an unsupported register */
- BUG_ON(!regp);
+ if (!regp)
+ return 0;
return regp->sys_val;
}
@@ -854,11 +1081,15 @@ static u64 __read_sysreg_by_encoding(u32 sys_id)
switch (sys_id) {
read_sysreg_case(SYS_ID_PFR0_EL1);
read_sysreg_case(SYS_ID_PFR1_EL1);
+ read_sysreg_case(SYS_ID_PFR2_EL1);
read_sysreg_case(SYS_ID_DFR0_EL1);
+ read_sysreg_case(SYS_ID_DFR1_EL1);
read_sysreg_case(SYS_ID_MMFR0_EL1);
read_sysreg_case(SYS_ID_MMFR1_EL1);
read_sysreg_case(SYS_ID_MMFR2_EL1);
read_sysreg_case(SYS_ID_MMFR3_EL1);
+ read_sysreg_case(SYS_ID_MMFR4_EL1);
+ read_sysreg_case(SYS_ID_MMFR5_EL1);
read_sysreg_case(SYS_ID_ISAR0_EL1);
read_sysreg_case(SYS_ID_ISAR1_EL1);
read_sysreg_case(SYS_ID_ISAR2_EL1);
@@ -1409,6 +1640,21 @@ static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry,
}
#endif
+#ifdef CONFIG_ARM64_BTI
+static void bti_enable(const struct arm64_cpu_capabilities *__unused)
+{
+ /*
+ * Use of X16/X17 for tail-calls and trampolines that jump to
+ * function entry points using BR is a requirement for
+ * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI.
+ * So, be strict and forbid other BRs using other registers to
+ * jump onto a PACIxSP instruction:
+ */
+ sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1);
+ isb();
+}
+#endif /* CONFIG_ARM64_BTI */
+
/* Internal helper functions to match cpu capability type */
static bool
cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@ -1511,6 +1757,18 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.field_pos = ID_AA64PFR0_EL0_SHIFT,
.min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
},
+#ifdef CONFIG_KVM
+ {
+ .desc = "32-bit EL1 Support",
+ .capability = ARM64_HAS_32BIT_EL1,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ .sys_reg = SYS_ID_AA64PFR0_EL1,
+ .sign = FTR_UNSIGNED,
+ .field_pos = ID_AA64PFR0_EL1_SHIFT,
+ .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT,
+ },
+#endif
{
.desc = "Kernel page table isolation (KPTI)",
.capability = ARM64_UNMAP_KERNEL_AT_EL0,
@@ -1779,6 +2037,23 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.min_field_value = 1,
},
#endif
+#ifdef CONFIG_ARM64_BTI
+ {
+ .desc = "Branch Target Identification",
+ .capability = ARM64_BTI,
+#ifdef CONFIG_ARM64_BTI_KERNEL
+ .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
+#else
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+#endif
+ .matches = has_cpuid_feature,
+ .cpu_enable = bti_enable,
+ .sys_reg = SYS_ID_AA64PFR1_EL1,
+ .field_pos = ID_AA64PFR1_BT_SHIFT,
+ .min_field_value = ID_AA64PFR1_BT_BTI,
+ .sign = FTR_UNSIGNED,
+ },
+#endif
{},
};
@@ -1888,6 +2163,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_F64MM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_F64MM, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM),
#endif
HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS),
+#ifdef CONFIG_ARM64_BTI
+ HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_BT_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_BT_BTI, CAP_HWCAP, KERNEL_HWCAP_BTI),
+#endif
#ifdef CONFIG_ARM64_PTR_AUTH
HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA),
HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG),
@@ -2181,6 +2459,36 @@ static void verify_sve_features(void)
/* Add checks on other ZCR bits here if necessary */
}
+static void verify_hyp_capabilities(void)
+{
+ u64 safe_mmfr1, mmfr0, mmfr1;
+ int parange, ipa_max;
+ unsigned int safe_vmid_bits, vmid_bits;
+
+ if (!IS_ENABLED(CONFIG_KVM) || !IS_ENABLED(CONFIG_KVM_ARM_HOST))
+ return;
+
+ safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
+ mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
+ mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
+
+ /* Verify VMID bits */
+ safe_vmid_bits = get_vmid_bits(safe_mmfr1);
+ vmid_bits = get_vmid_bits(mmfr1);
+ if (vmid_bits < safe_vmid_bits) {
+ pr_crit("CPU%d: VMID width mismatch\n", smp_processor_id());
+ cpu_die_early();
+ }
+
+ /* Verify IPA range */
+ parange = cpuid_feature_extract_unsigned_field(mmfr0,
+ ID_AA64MMFR0_PARANGE_SHIFT);
+ ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
+ if (ipa_max < get_kvm_ipa_limit()) {
+ pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id());
+ cpu_die_early();
+ }
+}
/*
* Run through the enabled system capabilities and enable() it on this CPU.
@@ -2206,6 +2514,9 @@ static void verify_local_cpu_capabilities(void)
if (system_supports_sve())
verify_sve_features();
+
+ if (is_hyp_mode_available())
+ verify_hyp_capabilities();
}
void check_local_cpu_capabilities(void)
@@ -2394,7 +2705,7 @@ static int emulate_sys_reg(u32 id, u64 *valp)
if (sys_reg_CRm(id) == 0)
return emulate_id_reg(id, valp);
- regp = get_arm64_ftr_reg(id);
+ regp = get_arm64_ftr_reg_nowarn(id);
if (regp)
*valp = arm64_ftr_reg_user_value(regp);
else
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 86136075ae41..86637466daa8 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -92,6 +92,7 @@ static const char *const hwcap_str[] = {
"bf16",
"dgh",
"rng",
+ "bti",
NULL
};
@@ -311,6 +312,8 @@ static int __init cpuinfo_regs_init(void)
}
return 0;
}
+device_initcall(cpuinfo_regs_init);
+
static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info)
{
unsigned int cpu = smp_processor_id();
@@ -362,6 +365,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
/* Update the 32bit ID registers only if AArch32 is implemented */
if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1);
+ info->reg_id_dfr1 = read_cpuid(ID_DFR1_EL1);
info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1);
info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1);
info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1);
@@ -373,8 +377,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1);
info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1);
info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1);
+ info->reg_id_mmfr4 = read_cpuid(ID_MMFR4_EL1);
+ info->reg_id_mmfr5 = read_cpuid(ID_MMFR5_EL1);
info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1);
info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1);
+ info->reg_id_pfr2 = read_cpuid(ID_PFR2_EL1);
info->reg_mvfr0 = read_cpuid(MVFR0_EL1);
info->reg_mvfr1 = read_cpuid(MVFR1_EL1);
@@ -403,5 +410,3 @@ void __init cpuinfo_store_boot_cpu(void)
boot_cpu_data = *info;
init_cpu_features(&boot_cpu_data);
}
-
-device_initcall(cpuinfo_regs_init);
diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
index ca4c3e12d8c5..1f646b07e3e9 100644
--- a/arch/arm64/kernel/crash_core.c
+++ b/arch/arm64/kernel/crash_core.c
@@ -5,6 +5,7 @@
*/
#include <linux/crash_core.h>
+#include <asm/cpufeature.h>
#include <asm/memory.h>
void arch_crash_save_vmcoreinfo(void)
@@ -16,4 +17,7 @@ void arch_crash_save_vmcoreinfo(void)
vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n",
PHYS_OFFSET);
vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+ vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n",
+ system_supports_address_auth() ?
+ ptrauth_kernel_pac_mask() : 0);
}
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 48222a4760c2..15e80c876d46 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -376,15 +376,13 @@ int aarch32_break_handler(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(aarch32_break_handler);
-static int __init debug_traps_init(void)
+void __init debug_traps_init(void)
{
hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP,
TRAP_TRACE, "single-step handler");
hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP,
TRAP_BRKPT, "ptrace BRK handler");
- return 0;
}
-arch_initcall(debug_traps_init);
/* Re-enable single step for syscall restarting. */
void user_rewind_single_step(struct task_struct *task)
diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S
index 1a03618df0df..0073b24b5d25 100644
--- a/arch/arm64/kernel/efi-entry.S
+++ b/arch/arm64/kernel/efi-entry.S
@@ -14,12 +14,12 @@
SYM_CODE_START(efi_enter_kernel)
/*
- * efi_entry() will have copied the kernel image if necessary and we
+ * efi_pe_entry() will have copied the kernel image if necessary and we
* end up here with device tree address in x1 and the kernel entry
* point stored in x0. Save those values in registers which are
* callee preserved.
*/
- ldr w2, =stext_offset
+ ldr w2, =primary_entry_offset
add x19, x0, x2 // relocated Image entrypoint
mov x20, x1 // DTB address
diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S
index 914999ccaf8a..df67c0f2a077 100644
--- a/arch/arm64/kernel/efi-header.S
+++ b/arch/arm64/kernel/efi-header.S
@@ -27,12 +27,12 @@ optional_header:
.long __initdata_begin - efi_header_end // SizeOfCode
.long __pecoff_data_size // SizeOfInitializedData
.long 0 // SizeOfUninitializedData
- .long __efistub_efi_entry - _head // AddressOfEntryPoint
+ .long __efistub_efi_pe_entry - _head // AddressOfEntryPoint
.long efi_header_end - _head // BaseOfCode
extra_header_fields:
.quad 0 // ImageBase
- .long SZ_4K // SectionAlignment
+ .long SEGMENT_ALIGN // SectionAlignment
.long PECOFF_FILE_ALIGNMENT // FileAlignment
.short 0 // MajorOperatingSystemVersion
.short 0 // MinorOperatingSystemVersion
diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S
index 3fc71106cb2b..75691a2641c1 100644
--- a/arch/arm64/kernel/efi-rt-wrapper.S
+++ b/arch/arm64/kernel/efi-rt-wrapper.S
@@ -5,7 +5,7 @@
#include <linux/linkage.h>
-ENTRY(__efi_rt_asm_wrapper)
+SYM_FUNC_START(__efi_rt_asm_wrapper)
stp x29, x30, [sp, #-32]!
mov x29, sp
@@ -34,5 +34,14 @@ ENTRY(__efi_rt_asm_wrapper)
ldp x29, x30, [sp], #32
b.ne 0f
ret
-0: b efi_handle_corrupted_x18 // tail call
-ENDPROC(__efi_rt_asm_wrapper)
+0:
+ /*
+ * With CONFIG_SHADOW_CALL_STACK, the kernel uses x18 to store a
+ * shadow stack pointer, which we need to restore before returning to
+ * potentially instrumented code. This is safe because the wrapper is
+ * called with preemption disabled and a separate shadow stack is used
+ * for interrupts.
+ */
+ mov x18, x2
+ b efi_handle_corrupted_x18 // tail call
+SYM_FUNC_END(__efi_rt_asm_wrapper)
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index c839b5bf1904..3dbdf9752b11 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -94,7 +94,7 @@ asmlinkage void notrace el1_sync_handler(struct pt_regs *regs)
break;
default:
el1_inv(regs, esr);
- };
+ }
}
NOKPROBE_SYMBOL(el1_sync_handler);
@@ -188,6 +188,14 @@ static void notrace el0_undef(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(el0_undef);
+static void notrace el0_bti(struct pt_regs *regs)
+{
+ user_exit_irqoff();
+ local_daif_restore(DAIF_PROCCTX);
+ do_bti(regs);
+}
+NOKPROBE_SYMBOL(el0_bti);
+
static void notrace el0_inv(struct pt_regs *regs, unsigned long esr)
{
user_exit_irqoff();
@@ -255,6 +263,9 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs)
case ESR_ELx_EC_UNKNOWN:
el0_undef(regs);
break;
+ case ESR_ELx_EC_BTI:
+ el0_bti(regs);
+ break;
case ESR_ELx_EC_BREAKPT_LOW:
case ESR_ELx_EC_SOFTSTP_LOW:
case ESR_ELx_EC_WATCHPT_LOW:
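The ESR_ELx_EC_BTI case above is taken when an indirect branch lands in a BTI-guarded user page without hitting a landing pad. A userspace sketch of how such a mapping is requested (PROT_BTI is arm64-specific; the fallback define here is for illustration only):

	#include <sys/mman.h>

	#ifndef PROT_BTI
	#define PROT_BTI	0x10
	#endif

	static void *jit_alloc(size_t len)
	{
		/* indirect branches into this region must land on BTI/PACIxSP,
		 * otherwise el0_bti() above delivers SIGILL */
		return mmap(NULL, len,
			    PROT_READ | PROT_WRITE | PROT_EXEC | PROT_BTI,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	}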
diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
index 0f24eae8f3cc..f880dd63ddc3 100644
--- a/arch/arm64/kernel/entry-fpsimd.S
+++ b/arch/arm64/kernel/entry-fpsimd.S
@@ -16,34 +16,34 @@
*
* x0 - pointer to struct fpsimd_state
*/
-ENTRY(fpsimd_save_state)
+SYM_FUNC_START(fpsimd_save_state)
fpsimd_save x0, 8
ret
-ENDPROC(fpsimd_save_state)
+SYM_FUNC_END(fpsimd_save_state)
/*
* Load the FP registers.
*
* x0 - pointer to struct fpsimd_state
*/
-ENTRY(fpsimd_load_state)
+SYM_FUNC_START(fpsimd_load_state)
fpsimd_restore x0, 8
ret
-ENDPROC(fpsimd_load_state)
+SYM_FUNC_END(fpsimd_load_state)
#ifdef CONFIG_ARM64_SVE
-ENTRY(sve_save_state)
+SYM_FUNC_START(sve_save_state)
sve_save 0, x1, 2
ret
-ENDPROC(sve_save_state)
+SYM_FUNC_END(sve_save_state)
-ENTRY(sve_load_state)
+SYM_FUNC_START(sve_load_state)
sve_load 0, x1, x2, 3, x4
ret
-ENDPROC(sve_load_state)
+SYM_FUNC_END(sve_load_state)
-ENTRY(sve_get_vl)
+SYM_FUNC_START(sve_get_vl)
_sve_rdvl 0, 1
ret
-ENDPROC(sve_get_vl)
+SYM_FUNC_END(sve_get_vl)
#endif /* CONFIG_ARM64_SVE */
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index 833d48c9acb5..a338f40e64d3 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -23,8 +23,9 @@
*
* ... where <entry> is either ftrace_caller or ftrace_regs_caller.
*
- * Each instrumented function follows the AAPCS, so here x0-x8 and x19-x30 are
- * live, and x9-x18 are safe to clobber.
+ * Each instrumented function follows the AAPCS, so here x0-x8 and x18-x30 are
+ * live (x18 holds the Shadow Call Stack pointer), and x9-x17 are safe to
+ * clobber.
*
* We save the callsite's context into a pt_regs before invoking any ftrace
* callbacks. So that we can get a sensible backtrace, we create a stack record
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ddcde093c433..5304d193c79d 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -23,6 +23,7 @@
#include <asm/mmu.h>
#include <asm/processor.h>
#include <asm/ptrace.h>
+#include <asm/scs.h>
#include <asm/thread_info.h>
#include <asm/asm-uaccess.h>
#include <asm/unistd.h>
@@ -178,7 +179,9 @@ alternative_cb_end
apply_ssbd 1, x22, x23
- ptrauth_keys_install_kernel tsk, 1, x20, x22, x23
+ ptrauth_keys_install_kernel tsk, x20, x22, x23
+
+ scs_load tsk, x20
.else
add x21, sp, #S_FRAME_SIZE
get_current_task tsk
@@ -343,6 +346,8 @@ alternative_else_nop_endif
msr cntkctl_el1, x1
4:
#endif
+ scs_save tsk, x0
+
/* No kernel C function calls after this as user keys are set. */
ptrauth_keys_install_user tsk, x0, x1, x2
@@ -388,6 +393,9 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
.macro irq_stack_entry
mov x19, sp // preserve the original sp
+#ifdef CONFIG_SHADOW_CALL_STACK
+ mov x24, scs_sp // preserve the original shadow stack
+#endif
/*
* Compare sp with the base of the task stack.
@@ -405,15 +413,25 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
/* switch to the irq stack */
mov sp, x26
+
+#ifdef CONFIG_SHADOW_CALL_STACK
+ /* also switch to the irq shadow stack */
+ adr_this_cpu scs_sp, irq_shadow_call_stack, x26
+#endif
+
9998:
.endm
/*
- * x19 should be preserved between irq_stack_entry and
- * irq_stack_exit.
+ * The callee-saved regs (x19-x29) should be preserved between
+ * irq_stack_entry and irq_stack_exit, but note that kernel_entry
+ * uses x20-x23 to store data for later use.
*/
.macro irq_stack_exit
mov sp, x19
+#ifdef CONFIG_SHADOW_CALL_STACK
+ mov scs_sp, x24
+#endif
.endm
/* GPRs used by entry code */
@@ -728,20 +746,9 @@ el0_error_naked:
SYM_CODE_END(el0_error)
/*
- * Ok, we need to do extra processing, enter the slow path.
- */
-work_pending:
- mov x0, sp // 'regs'
- bl do_notify_resume
-#ifdef CONFIG_TRACE_IRQFLAGS
- bl trace_hardirqs_on // enabled while in userspace
-#endif
- ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
- b finish_ret_to_user
-/*
* "slow" syscall return path.
*/
-ret_to_user:
+SYM_CODE_START_LOCAL(ret_to_user)
disable_daif
gic_prio_kentry_setup tmp=x3
ldr x1, [tsk, #TSK_TI_FLAGS]
@@ -753,7 +760,19 @@ finish_ret_to_user:
bl stackleak_erase
#endif
kernel_exit 0
-ENDPROC(ret_to_user)
+
+/*
+ * Ok, we need to do extra processing, enter the slow path.
+ */
+work_pending:
+ mov x0, sp // 'regs'
+ bl do_notify_resume
+#ifdef CONFIG_TRACE_IRQFLAGS
+ bl trace_hardirqs_on // enabled while in userspace
+#endif
+ ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
+ b finish_ret_to_user
+SYM_CODE_END(ret_to_user)
.popsection // .entry.text
@@ -900,7 +919,9 @@ SYM_FUNC_START(cpu_switch_to)
ldr lr, [x8]
mov sp, x9
msr sp_el0, x1
- ptrauth_keys_install_kernel x1, 1, x8, x9, x10
+ ptrauth_keys_install_kernel x1, x8, x9, x10
+ scs_save x0, x8
+ scs_load x1, x8
ret
SYM_FUNC_END(cpu_switch_to)
NOKPROBE(cpu_switch_to)
@@ -1029,13 +1050,16 @@ SYM_CODE_START(__sdei_asm_handler)
mov x19, x1
+#if defined(CONFIG_VMAP_STACK) || defined(CONFIG_SHADOW_CALL_STACK)
+ ldrb w4, [x19, #SDEI_EVENT_PRIORITY]
+#endif
+
#ifdef CONFIG_VMAP_STACK
/*
* entry.S may have been using sp as a scratch register, find whether
* this is a normal or critical event and switch to the appropriate
* stack for this CPU.
*/
- ldrb w4, [x19, #SDEI_EVENT_PRIORITY]
cbnz w4, 1f
ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6
b 2f
@@ -1045,6 +1069,15 @@ SYM_CODE_START(__sdei_asm_handler)
mov sp, x5
#endif
+#ifdef CONFIG_SHADOW_CALL_STACK
+ /* Use a separate shadow call stack for normal and critical events */
+ cbnz w4, 3f
+ adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal, tmp=x6
+ b 4f
+3: adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical, tmp=x6
+4:
+#endif
+
/*
* We may have interrupted userspace, or a guest, or exit-from or
* return-to either of these. We can't trust sp_el0, restore it.
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 94289d126993..35cb5e66c504 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl)
#ifdef CONFIG_SYSCTL
static int sve_proc_do_default_vl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
int vl = sve_default_vl;
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 57a91032b4c2..632702146813 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/irqchip/arm-gic-v3.h>
+#include <asm/asm_pointer_auth.h>
#include <asm/assembler.h>
#include <asm/boot.h>
#include <asm/ptrace.h>
@@ -27,6 +28,7 @@
#include <asm/pgtable-hwdef.h>
#include <asm/pgtable.h>
#include <asm/page.h>
+#include <asm/scs.h>
#include <asm/smp.h>
#include <asm/sysreg.h>
#include <asm/thread_info.h>
@@ -70,9 +72,9 @@ _head:
* its opcode forms the magic "MZ" signature required by UEFI.
*/
add x13, x18, #0x16
- b stext
+ b primary_entry
#else
- b stext // branch to kernel start, magic
+ b primary_entry // branch to kernel start, magic
.long 0 // reserved
#endif
le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian
@@ -98,14 +100,13 @@ pe_header:
* primary lowlevel boot path:
*
* Register Scope Purpose
- * x21 stext() .. start_kernel() FDT pointer passed at boot in x0
- * x23 stext() .. start_kernel() physical misalignment/KASLR offset
- * x28 __create_page_tables() callee preserved temp register
- * x19/x20 __primary_switch() callee preserved temp registers
- * x24 __primary_switch() .. relocate_kernel()
- * current RELR displacement
+ * x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0
+ * x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset
+ * x28 __create_page_tables() callee preserved temp register
+ * x19/x20 __primary_switch() callee preserved temp registers
+ * x24 __primary_switch() .. relocate_kernel() current RELR displacement
*/
-SYM_CODE_START(stext)
+SYM_CODE_START(primary_entry)
bl preserve_boot_args
bl el2_setup // Drop to EL1, w0=cpu_boot_mode
adrp x23, __PHYS_OFFSET
@@ -118,10 +119,9 @@ SYM_CODE_START(stext)
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
- mov x0, #ARM64_CPU_BOOT_PRIMARY
bl __cpu_setup // initialise processor
b __primary_switch
-SYM_CODE_END(stext)
+SYM_CODE_END(primary_entry)
/*
* Preserve the arguments passed by the bootloader in x0 .. x3
@@ -394,13 +394,19 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
/*
* Since the page tables have been populated with non-cacheable
- * accesses (MMU disabled), invalidate the idmap and swapper page
- * tables again to remove any speculatively loaded cache lines.
+ * accesses (MMU disabled), invalidate those tables again to
+ * remove any speculatively loaded cache lines.
*/
+ dmb sy
+
adrp x0, idmap_pg_dir
+ adrp x1, idmap_pg_end
+ sub x1, x1, x0
+ bl __inval_dcache_area
+
+ adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0
- dmb sy
bl __inval_dcache_area
ret x28
@@ -417,6 +423,10 @@ SYM_FUNC_START_LOCAL(__primary_switched)
adr_l x5, init_task
msr sp_el0, x5 // Save thread_info
+#ifdef CONFIG_ARM64_PTR_AUTH
+ __ptrauth_keys_init_cpu x5, x6, x7, x8
+#endif
+
adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
isb
@@ -424,6 +434,10 @@ SYM_FUNC_START_LOCAL(__primary_switched)
stp xzr, x30, [sp, #-16]!
mov x29, sp
+#ifdef CONFIG_SHADOW_CALL_STACK
+ adr_l scs_sp, init_shadow_call_stack // Set shadow call stack
+#endif
+
str_l x21, __fdt_pointer, x5 // Save FDT pointer
ldr_l x4, kimage_vaddr // Save the offset between
@@ -717,7 +731,6 @@ SYM_FUNC_START_LOCAL(secondary_startup)
* Common entry point for secondary CPUs.
*/
bl __cpu_secondary_check52bitva
- mov x0, #ARM64_CPU_BOOT_SECONDARY
bl __cpu_setup // initialise processor
adrp x1, swapper_pg_dir
bl __enable_mmu
@@ -737,8 +750,14 @@ SYM_FUNC_START_LOCAL(__secondary_switched)
ldr x2, [x0, #CPU_BOOT_TASK]
cbz x2, __secondary_too_slow
msr sp_el0, x2
+ scs_load x2, x3
mov x29, #0
mov x30, #0
+
+#ifdef CONFIG_ARM64_PTR_AUTH
+ ptrauth_keys_init_cpu x2, x3, x4, x5
+#endif
+
b secondary_start_kernel
SYM_FUNC_END(__secondary_switched)
diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S
index 6532105b3e32..8ccca660034e 100644
--- a/arch/arm64/kernel/hibernate-asm.S
+++ b/arch/arm64/kernel/hibernate-asm.S
@@ -65,7 +65,7 @@
* x5: physical address of a zero page that remains zero after resume
*/
.pushsection ".hibernate_exit.text", "ax"
-ENTRY(swsusp_arch_suspend_exit)
+SYM_CODE_START(swsusp_arch_suspend_exit)
/*
* We execute from ttbr0, change ttbr1 to our copied linear map tables
* with a break-before-make via the zero page
@@ -110,7 +110,7 @@ ENTRY(swsusp_arch_suspend_exit)
cbz x24, 3f /* Do we need to re-initialise EL2? */
hvc #0
3: ret
-ENDPROC(swsusp_arch_suspend_exit)
+SYM_CODE_END(swsusp_arch_suspend_exit)
/*
* Restore the hyp stub.
@@ -119,15 +119,15 @@ ENDPROC(swsusp_arch_suspend_exit)
*
* x24: The physical address of __hyp_stub_vectors
*/
-el1_sync:
+SYM_CODE_START_LOCAL(el1_sync)
msr vbar_el2, x24
eret
-ENDPROC(el1_sync)
+SYM_CODE_END(el1_sync)
.macro invalid_vector label
-\label:
+SYM_CODE_START_LOCAL(\label)
b \label
-ENDPROC(\label)
+SYM_CODE_END(\label)
.endm
invalid_vector el2_sync_invalid
@@ -141,7 +141,7 @@ ENDPROC(\label)
/* el2 vectors - switch el2 here while we restore the memory image. */
.align 11
-ENTRY(hibernate_el2_vectors)
+SYM_CODE_START(hibernate_el2_vectors)
ventry el2_sync_invalid // Synchronous EL2t
ventry el2_irq_invalid // IRQ EL2t
ventry el2_fiq_invalid // FIQ EL2t
@@ -161,6 +161,6 @@ ENTRY(hibernate_el2_vectors)
ventry el1_irq_invalid // IRQ 32-bit EL1
ventry el1_fiq_invalid // FIQ 32-bit EL1
ventry el1_error_invalid // Error 32-bit EL1
-END(hibernate_el2_vectors)
+SYM_CODE_END(hibernate_el2_vectors)
.popsection
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 5b73e92c99e3..a8a4b55f3a09 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -184,6 +184,7 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page,
pgprot_t pgprot)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
@@ -196,7 +197,15 @@ static int trans_pgd_map_page(pgd_t *trans_pgd, void *page,
pgd_populate(&init_mm, pgdp, pudp);
}
- pudp = pud_offset(pgdp, dst_addr);
+ p4dp = p4d_offset(pgdp, dst_addr);
+ if (p4d_none(READ_ONCE(*p4dp))) {
+ pudp = (void *)get_safe_page(GFP_ATOMIC);
+ if (!pudp)
+ return -ENOMEM;
+ p4d_populate(&init_mm, p4dp, pudp);
+ }
+
+ pudp = pud_offset(p4dp, dst_addr);
if (pud_none(READ_ONCE(*pudp))) {
pmdp = (void *)get_safe_page(GFP_ATOMIC);
if (!pmdp)
@@ -419,7 +428,7 @@ static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start,
return 0;
}
-static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start,
+static int copy_pud(p4d_t *dst_p4dp, p4d_t *src_p4dp, unsigned long start,
unsigned long end)
{
pud_t *dst_pudp;
@@ -427,15 +436,15 @@ static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start,
unsigned long next;
unsigned long addr = start;
- if (pgd_none(READ_ONCE(*dst_pgdp))) {
+ if (p4d_none(READ_ONCE(*dst_p4dp))) {
dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC);
if (!dst_pudp)
return -ENOMEM;
- pgd_populate(&init_mm, dst_pgdp, dst_pudp);
+ p4d_populate(&init_mm, dst_p4dp, dst_pudp);
}
- dst_pudp = pud_offset(dst_pgdp, start);
+ dst_pudp = pud_offset(dst_p4dp, start);
- src_pudp = pud_offset(src_pgdp, start);
+ src_pudp = pud_offset(src_p4dp, start);
do {
pud_t pud = READ_ONCE(*src_pudp);
@@ -454,6 +463,27 @@ static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start,
return 0;
}
+static int copy_p4d(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start,
+ unsigned long end)
+{
+ p4d_t *dst_p4dp;
+ p4d_t *src_p4dp;
+ unsigned long next;
+ unsigned long addr = start;
+
+ dst_p4dp = p4d_offset(dst_pgdp, start);
+ src_p4dp = p4d_offset(src_pgdp, start);
+ do {
+ next = p4d_addr_end(addr, end);
+ if (p4d_none(READ_ONCE(*src_p4dp)))
+ continue;
+ if (copy_pud(dst_p4dp, src_p4dp, addr, next))
+ return -ENOMEM;
+ } while (dst_p4dp++, src_p4dp++, addr = next, addr != end);
+
+ return 0;
+}
+
static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start,
unsigned long end)
{
@@ -466,7 +496,7 @@ static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start,
next = pgd_addr_end(addr, end);
if (pgd_none(READ_ONCE(*src_pgdp)))
continue;
- if (copy_pud(dst_pgdp, src_pgdp, addr, next))
+ if (copy_p4d(dst_pgdp, src_pgdp, addr, next))
return -ENOMEM;
} while (dst_pgdp++, src_pgdp++, addr = next, addr != end);
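These hunks thread the new p4d level between pgd and pud. For reference, the full descent now looks like the sketch below, using the generic pgtable helpers; *_none() and huge-mapping checks are omitted for brevity:

	static pte_t *walk_to_pte(struct mm_struct *mm, unsigned long addr)
	{
		pgd_t *pgdp = pgd_offset(mm, addr);
		p4d_t *p4dp = p4d_offset(pgdp, addr);
		pud_t *pudp = pud_offset(p4dp, addr);
		pmd_t *pmdp = pmd_offset(pudp, addr);

		return pte_offset_kernel(pmdp, addr);
	}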
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index e473ead806ed..160f5881a0b7 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -21,7 +21,7 @@
.align 11
-ENTRY(__hyp_stub_vectors)
+SYM_CODE_START(__hyp_stub_vectors)
ventry el2_sync_invalid // Synchronous EL2t
ventry el2_irq_invalid // IRQ EL2t
ventry el2_fiq_invalid // FIQ EL2t
@@ -41,11 +41,11 @@ ENTRY(__hyp_stub_vectors)
ventry el1_irq_invalid // IRQ 32-bit EL1
ventry el1_fiq_invalid // FIQ 32-bit EL1
ventry el1_error_invalid // Error 32-bit EL1
-ENDPROC(__hyp_stub_vectors)
+SYM_CODE_END(__hyp_stub_vectors)
.align 11
-el1_sync:
+SYM_CODE_START_LOCAL(el1_sync)
cmp x0, #HVC_SET_VECTORS
b.ne 2f
msr vbar_el2, x1
@@ -68,12 +68,12 @@ el1_sync:
9: mov x0, xzr
eret
-ENDPROC(el1_sync)
+SYM_CODE_END(el1_sync)
.macro invalid_vector label
-\label:
+SYM_CODE_START_LOCAL(\label)
b \label
-ENDPROC(\label)
+SYM_CODE_END(\label)
.endm
invalid_vector el2_sync_invalid
@@ -106,15 +106,15 @@ ENDPROC(\label)
* initialisation entry point.
*/
-ENTRY(__hyp_set_vectors)
+SYM_FUNC_START(__hyp_set_vectors)
mov x1, x0
mov x0, #HVC_SET_VECTORS
hvc #0
ret
-ENDPROC(__hyp_set_vectors)
+SYM_FUNC_END(__hyp_set_vectors)
-ENTRY(__hyp_reset_vectors)
+SYM_FUNC_START(__hyp_reset_vectors)
mov x0, #HVC_RESET_VECTORS
hvc #0
ret
-ENDPROC(__hyp_reset_vectors)
+SYM_FUNC_END(__hyp_reset_vectors)
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 7f06ad93fc95..be0a63ffed23 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -13,7 +13,7 @@
#ifdef CONFIG_EFI
__efistub_kernel_size = _edata - _text;
-__efistub_stext_offset = stext - _text;
+__efistub_primary_entry_offset = primary_entry - _text;
/*
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 4a9e773a177f..684d871ae38d 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -51,21 +51,33 @@ enum aarch64_insn_encoding_class __kprobes aarch64_get_insn_class(u32 insn)
return aarch64_insn_encoding_class[(insn >> 25) & 0xf];
}
-/* NOP is an alias of HINT */
-bool __kprobes aarch64_insn_is_nop(u32 insn)
+bool __kprobes aarch64_insn_is_steppable_hint(u32 insn)
{
if (!aarch64_insn_is_hint(insn))
return false;
switch (insn & 0xFE0) {
- case AARCH64_INSN_HINT_YIELD:
- case AARCH64_INSN_HINT_WFE:
- case AARCH64_INSN_HINT_WFI:
- case AARCH64_INSN_HINT_SEV:
- case AARCH64_INSN_HINT_SEVL:
- return false;
- default:
+ case AARCH64_INSN_HINT_XPACLRI:
+ case AARCH64_INSN_HINT_PACIA_1716:
+ case AARCH64_INSN_HINT_PACIB_1716:
+ case AARCH64_INSN_HINT_AUTIA_1716:
+ case AARCH64_INSN_HINT_AUTIB_1716:
+ case AARCH64_INSN_HINT_PACIAZ:
+ case AARCH64_INSN_HINT_PACIASP:
+ case AARCH64_INSN_HINT_PACIBZ:
+ case AARCH64_INSN_HINT_PACIBSP:
+ case AARCH64_INSN_HINT_AUTIAZ:
+ case AARCH64_INSN_HINT_AUTIASP:
+ case AARCH64_INSN_HINT_AUTIBZ:
+ case AARCH64_INSN_HINT_AUTIBSP:
+ case AARCH64_INSN_HINT_BTI:
+ case AARCH64_INSN_HINT_BTIC:
+ case AARCH64_INSN_HINT_BTIJ:
+ case AARCH64_INSN_HINT_BTIJC:
+ case AARCH64_INSN_HINT_NOP:
return true;
+ default:
+ return false;
}
}
@@ -574,7 +586,7 @@ u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr,
offset >> 2);
}
-u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_op op)
+u32 __kprobes aarch64_insn_gen_hint(enum aarch64_insn_hint_cr_op op)
{
return aarch64_insn_get_hint_value() | op;
}
@@ -1535,16 +1547,10 @@ static u32 aarch64_encode_immediate(u64 imm,
u32 insn)
{
unsigned int immr, imms, n, ones, ror, esz, tmp;
- u64 mask = ~0UL;
-
- /* Can't encode full zeroes or full ones */
- if (!imm || !~imm)
- return AARCH64_BREAK_FAULT;
+ u64 mask;
switch (variant) {
case AARCH64_INSN_VARIANT_32BIT:
- if (upper_32_bits(imm))
- return AARCH64_BREAK_FAULT;
esz = 32;
break;
case AARCH64_INSN_VARIANT_64BIT:
@@ -1556,6 +1562,12 @@ static u32 aarch64_encode_immediate(u64 imm,
return AARCH64_BREAK_FAULT;
}
+ mask = GENMASK(esz - 1, 0);
+
+ /* Can't encode full zeroes, full ones, or value wider than the mask */
+ if (!imm || imm == mask || imm & ~mask)
+ return AARCH64_BREAK_FAULT;
+
/*
* Inverse of Replicate(). Try to spot a repeating pattern
* with a pow2 stride.
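The rework derives the bounds check from the element size instead of special-casing 32-bit values up front. The resulting predicate, condensed into a standalone sketch (assumes GENMASK() from <linux/bits.h>):

	static bool logical_imm_encodable(u64 imm, unsigned int esz)
	{
		u64 mask = GENMASK(esz - 1, 0);

		/* all-zeroes, all-ones and out-of-range values have no encoding */
		return imm && imm != mask && !(imm & ~mask);
	}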
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 8e9c924423b4..a0b144cfaea7 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -177,6 +177,7 @@ void machine_kexec(struct kimage *kimage)
* the offline CPUs. Therefore, we must use the __* variant here.
*/
__flush_icache_range((uintptr_t)reboot_code_buffer,
+ (uintptr_t)reboot_code_buffer +
arm64_relocate_new_kernel_size);
/* Flush the kimage list and its buffers. */
diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c
index b40c3b0def92..522e6f517ec0 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -138,12 +138,12 @@ static int setup_dtb(struct kimage *image,
/* add rng-seed */
if (rng_is_initialized()) {
- u8 rng_seed[RNG_SEED_SIZE];
- get_random_bytes(rng_seed, RNG_SEED_SIZE);
- ret = fdt_setprop(dtb, off, FDT_PROP_RNG_SEED, rng_seed,
- RNG_SEED_SIZE);
+ void *rng_seed;
+ ret = fdt_setprop_placeholder(dtb, off, FDT_PROP_RNG_SEED,
+ RNG_SEED_SIZE, &rng_seed);
if (ret)
goto out;
+ get_random_bytes(rng_seed, RNG_SEED_SIZE);
} else {
pr_notice("RNG is not initialised: omitting \"%s\" property\n",
FDT_PROP_RNG_SEED);
@@ -284,7 +284,7 @@ int load_other_segments(struct kimage *image,
image->arch.elf_headers_sz = headers_sz;
pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->arch.elf_headers_mem, headers_sz, headers_sz);
+ image->arch.elf_headers_mem, kbuf.bufsz, kbuf.memsz);
}
/* load initrd */
@@ -305,7 +305,7 @@ int load_other_segments(struct kimage *image,
initrd_load_addr = kbuf.mem;
pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- initrd_load_addr, initrd_len, initrd_len);
+ initrd_load_addr, kbuf.bufsz, kbuf.memsz);
}
/* load dtb */
@@ -332,7 +332,7 @@ int load_other_segments(struct kimage *image,
image->arch.dtb_mem = kbuf.mem;
pr_debug("Loaded dtb at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- kbuf.mem, dtb_len, dtb_len);
+ kbuf.mem, kbuf.bufsz, kbuf.memsz);
return 0;
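The rng-seed change switches to fdt_setprop_placeholder() so the random bytes are written straight into the blob instead of passing through (and lingering in) a stack buffer. The general shape of that libfdt pattern, sketched as a hypothetical helper rather than the patch's exact code:

	static int fdt_set_rng_seed(void *fdt, int node, int len)
	{
		void *seed;
		int ret = fdt_setprop_placeholder(fdt, node, "rng-seed",
						  len, &seed);

		if (!ret)
			get_random_bytes(seed, len);	/* fill in place */
		return ret;
	}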
diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c
index 1ef702b0be2d..295d66490584 100644
--- a/arch/arm64/kernel/paravirt.c
+++ b/arch/arm64/kernel/paravirt.c
@@ -120,7 +120,7 @@ static bool has_pv_steal_clock(void)
struct arm_smccc_res res;
/* To detect the presence of PV time support we require SMCCC 1.1+ */
- if (psci_ops.smccc_version < SMCCC_VERSION_1_1)
+ if (arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)
return false;
arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c
index b78fac9e546c..263d5fba4c8a 100644
--- a/arch/arm64/kernel/probes/decode-insn.c
+++ b/arch/arm64/kernel/probes/decode-insn.c
@@ -46,7 +46,7 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn)
* except for the NOP case.
*/
if (aarch64_insn_is_hint(insn))
- return aarch64_insn_is_nop(insn);
+ return aarch64_insn_is_steppable_hint(insn);
return true;
}
diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S
index 45dce03aaeaf..890ca72c5a51 100644
--- a/arch/arm64/kernel/probes/kprobes_trampoline.S
+++ b/arch/arm64/kernel/probes/kprobes_trampoline.S
@@ -61,7 +61,7 @@
ldp x28, x29, [sp, #S_X28]
.endm
-ENTRY(kretprobe_trampoline)
+SYM_CODE_START(kretprobe_trampoline)
sub sp, sp, #S_FRAME_SIZE
save_all_base_regs
@@ -79,4 +79,4 @@ ENTRY(kretprobe_trampoline)
add sp, sp, #S_FRAME_SIZE
ret
-ENDPROC(kretprobe_trampoline)
+SYM_CODE_END(kretprobe_trampoline)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 56be4cbf771f..eade7807e819 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -11,6 +11,7 @@
#include <linux/compat.h>
#include <linux/efi.h>
+#include <linux/elf.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
@@ -18,6 +19,7 @@
#include <linux/sched/task_stack.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
+#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
@@ -209,6 +211,15 @@ void machine_restart(char *cmd)
while (1);
}
+#define bstr(suffix, str) [PSR_BTYPE_ ## suffix >> PSR_BTYPE_SHIFT] = str
+static const char *const btypes[] = {
+ bstr(NONE, "--"),
+ bstr( JC, "jc"),
+ bstr( C, "-c"),
+ bstr( J , "j-")
+};
+#undef bstr
+
static void print_pstate(struct pt_regs *regs)
{
u64 pstate = regs->pstate;
@@ -227,7 +238,10 @@ static void print_pstate(struct pt_regs *regs)
pstate & PSR_AA32_I_BIT ? 'I' : 'i',
pstate & PSR_AA32_F_BIT ? 'F' : 'f');
} else {
- printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO)\n",
+ const char *btype_str = btypes[(pstate & PSR_BTYPE_MASK) >>
+ PSR_BTYPE_SHIFT];
+
+ printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO BTYPE=%s)\n",
pstate,
pstate & PSR_N_BIT ? 'N' : 'n',
pstate & PSR_Z_BIT ? 'Z' : 'z',
@@ -238,7 +252,8 @@ static void print_pstate(struct pt_regs *regs)
pstate & PSR_I_BIT ? 'I' : 'i',
pstate & PSR_F_BIT ? 'F' : 'f',
pstate & PSR_PAN_BIT ? '+' : '-',
- pstate & PSR_UAO_BIT ? '+' : '-');
+ pstate & PSR_UAO_BIT ? '+' : '-',
+ btype_str);
}
}
@@ -655,3 +670,25 @@ asmlinkage void __sched arm64_preempt_schedule_irq(void)
if (system_capabilities_finalized())
preempt_schedule_irq();
}
+
+#ifdef CONFIG_BINFMT_ELF
+int arch_elf_adjust_prot(int prot, const struct arch_elf_state *state,
+ bool has_interp, bool is_interp)
+{
+ /*
+ * For dynamically linked executables the interpreter is
+ * responsible for setting PROT_BTI on everything except
+ * itself.
+ */
+ if (is_interp != has_interp)
+ return prot;
+
+ if (!(state->flags & ARM64_ELF_BTI))
+ return prot;
+
+ if (prot & PROT_EXEC)
+ prot |= PROT_BTI;
+
+ return prot;
+}
+#endif
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index b3d3005d9515..76790a5f2a0d 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1829,10 +1829,11 @@ static void tracehook_report_syscall(struct pt_regs *regs,
int syscall_trace_enter(struct pt_regs *regs)
{
- if (test_thread_flag(TIF_SYSCALL_TRACE) ||
- test_thread_flag(TIF_SYSCALL_EMU)) {
+ unsigned long flags = READ_ONCE(current_thread_info()->flags);
+
+ if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) {
tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
- if (!in_syscall(regs) || test_thread_flag(TIF_SYSCALL_EMU))
+ if (!in_syscall(regs) || (flags & _TIF_SYSCALL_EMU))
return -1;
}
@@ -1874,7 +1875,7 @@ void syscall_trace_exit(struct pt_regs *regs)
*/
#define SPSR_EL1_AARCH64_RES0_BITS \
(GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \
- GENMASK_ULL(20, 13) | GENMASK_ULL(11, 10) | GENMASK_ULL(5, 5))
+ GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5))
#define SPSR_EL1_AARCH32_RES0_BITS \
(GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20))
diff --git a/arch/arm64/kernel/reloc_test_syms.S b/arch/arm64/kernel/reloc_test_syms.S
index 16a34f188f26..c50f45fa29fa 100644
--- a/arch/arm64/kernel/reloc_test_syms.S
+++ b/arch/arm64/kernel/reloc_test_syms.S
@@ -5,81 +5,81 @@
#include <linux/linkage.h>
-ENTRY(absolute_data64)
+SYM_FUNC_START(absolute_data64)
ldr x0, 0f
ret
0: .quad sym64_abs
-ENDPROC(absolute_data64)
+SYM_FUNC_END(absolute_data64)
-ENTRY(absolute_data32)
+SYM_FUNC_START(absolute_data32)
ldr w0, 0f
ret
0: .long sym32_abs
-ENDPROC(absolute_data32)
+SYM_FUNC_END(absolute_data32)
-ENTRY(absolute_data16)
+SYM_FUNC_START(absolute_data16)
adr x0, 0f
ldrh w0, [x0]
ret
0: .short sym16_abs, 0
-ENDPROC(absolute_data16)
+SYM_FUNC_END(absolute_data16)
-ENTRY(signed_movw)
+SYM_FUNC_START(signed_movw)
movz x0, #:abs_g2_s:sym64_abs
movk x0, #:abs_g1_nc:sym64_abs
movk x0, #:abs_g0_nc:sym64_abs
ret
-ENDPROC(signed_movw)
+SYM_FUNC_END(signed_movw)
-ENTRY(unsigned_movw)
+SYM_FUNC_START(unsigned_movw)
movz x0, #:abs_g3:sym64_abs
movk x0, #:abs_g2_nc:sym64_abs
movk x0, #:abs_g1_nc:sym64_abs
movk x0, #:abs_g0_nc:sym64_abs
ret
-ENDPROC(unsigned_movw)
+SYM_FUNC_END(unsigned_movw)
.align 12
.space 0xff8
-ENTRY(relative_adrp)
+SYM_FUNC_START(relative_adrp)
adrp x0, sym64_rel
add x0, x0, #:lo12:sym64_rel
ret
-ENDPROC(relative_adrp)
+SYM_FUNC_END(relative_adrp)
.align 12
.space 0xffc
-ENTRY(relative_adrp_far)
+SYM_FUNC_START(relative_adrp_far)
adrp x0, memstart_addr
add x0, x0, #:lo12:memstart_addr
ret
-ENDPROC(relative_adrp_far)
+SYM_FUNC_END(relative_adrp_far)
-ENTRY(relative_adr)
+SYM_FUNC_START(relative_adr)
adr x0, sym64_rel
ret
-ENDPROC(relative_adr)
+SYM_FUNC_END(relative_adr)
-ENTRY(relative_data64)
+SYM_FUNC_START(relative_data64)
adr x1, 0f
ldr x0, [x1]
add x0, x0, x1
ret
0: .quad sym64_rel - .
-ENDPROC(relative_data64)
+SYM_FUNC_END(relative_data64)
-ENTRY(relative_data32)
+SYM_FUNC_START(relative_data32)
adr x1, 0f
ldr w0, [x1]
add x0, x0, x1
ret
0: .long sym64_rel - .
-ENDPROC(relative_data32)
+SYM_FUNC_END(relative_data32)
-ENTRY(relative_data16)
+SYM_FUNC_START(relative_data16)
adr x1, 0f
ldrsh w0, [x1]
add x0, x0, x1
ret
0: .short sym64_rel - ., 0
-ENDPROC(relative_data16)
+SYM_FUNC_END(relative_data16)
diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S
index c40ce496c78b..542d6edc6806 100644
--- a/arch/arm64/kernel/relocate_kernel.S
+++ b/arch/arm64/kernel/relocate_kernel.S
@@ -26,7 +26,7 @@
* control_code_page, a special page which has been set up to be preserved
* during the copy operation.
*/
-ENTRY(arm64_relocate_new_kernel)
+SYM_CODE_START(arm64_relocate_new_kernel)
/* Setup the list loop variables. */
mov x18, x2 /* x18 = dtb address */
@@ -111,7 +111,7 @@ ENTRY(arm64_relocate_new_kernel)
mov x3, xzr
br x17
-ENDPROC(arm64_relocate_new_kernel)
+SYM_CODE_END(arm64_relocate_new_kernel)
.align 3 /* To keep the 64-bit values below naturally aligned. */
diff --git a/arch/arm64/kernel/scs.c b/arch/arm64/kernel/scs.c
new file mode 100644
index 000000000000..e8f7ff45dd8f
--- /dev/null
+++ b/arch/arm64/kernel/scs.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Shadow Call Stack support.
+ *
+ * Copyright (C) 2019 Google LLC
+ */
+
+#include <linux/percpu.h>
+#include <linux/scs.h>
+
+DEFINE_SCS(irq_shadow_call_stack);
+
+#ifdef CONFIG_ARM_SDE_INTERFACE
+DEFINE_SCS(sdei_shadow_call_stack_normal);
+DEFINE_SCS(sdei_shadow_call_stack_critical);
+#endif
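DEFINE_SCS() comes from <linux/scs.h>; roughly speaking (an assumption about the generic macro, not a verbatim copy), each line above amounts to a per-CPU stack that the entry code can point scs_sp (x18) at:

	DEFINE_PER_CPU(unsigned long [SCS_SIZE / sizeof(long)],
		       irq_shadow_call_stack);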
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index d6259dac62b6..dab88260b137 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -95,19 +95,7 @@ static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info)
unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr);
unsigned long high = low + SDEI_STACK_SIZE;
- if (!low)
- return false;
-
- if (sp < low || sp >= high)
- return false;
-
- if (info) {
- info->low = low;
- info->high = high;
- info->type = STACK_TYPE_SDEI_NORMAL;
- }
-
- return true;
+ return on_stack(sp, low, high, STACK_TYPE_SDEI_NORMAL, info);
}
static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info)
@@ -115,19 +103,7 @@ static bool on_sdei_critical_stack(unsigned long sp, struct stack_info *info)
unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr);
unsigned long high = low + SDEI_STACK_SIZE;
- if (!low)
- return false;
-
- if (sp < low || sp >= high)
- return false;
-
- if (info) {
- info->low = low;
- info->high = high;
- info->type = STACK_TYPE_SDEI_CRITICAL;
- }
-
- return true;
+ return on_stack(sp, low, high, STACK_TYPE_SDEI_CRITICAL, info);
}
bool _on_sdei_stack(unsigned long sp, struct stack_info *info)
@@ -251,22 +227,12 @@ asmlinkage __kprobes notrace unsigned long
__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg)
{
unsigned long ret;
- bool do_nmi_exit = false;
- /*
- * nmi_enter() deals with printk() re-entrance and use of RCU when
- * RCU believed this CPU was idle. Because critical events can
- * interrupt normal events, we may already be in_nmi().
- */
- if (!in_nmi()) {
- nmi_enter();
- do_nmi_exit = true;
- }
+ nmi_enter();
ret = _sdei_handler(regs, arg);
- if (do_nmi_exit)
- nmi_exit();
+ nmi_exit();
return ret;
}
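Both on_sdei_*_stack() helpers above now delegate the range check to a common on_stack() routine defined in the arm64 stacktrace header rather than in this file. Its assumed shape, for reference:

	static inline bool on_stack(unsigned long sp, unsigned long low,
				    unsigned long high, enum stack_type type,
				    struct stack_info *info)
	{
		if (!low || sp < low || sp >= high)
			return false;

		if (info) {
			info->low = low;
			info->high = high;
			info->type = type;
		}

		return true;
	}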
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 339882db5a91..801d56cdf701 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -732,6 +732,22 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
regs->regs[29] = (unsigned long)&user->next_frame->fp;
regs->pc = (unsigned long)ka->sa.sa_handler;
+ /*
+ * Signal delivery is a (wacky) indirect function call in
+ * userspace, so simulate the same setting of BTYPE as a BLR
+ * <register containing the signal handler entry point>.
+ * Signal delivery to a location in a PROT_BTI guarded page
+ * that is not a function entry point will now trigger a
+ * SIGILL in userspace.
+ *
+ * If the signal handler entry point is not in a PROT_BTI
+ * guarded page, this is harmless.
+ */
+ if (system_supports_bti()) {
+ regs->pstate &= ~PSR_BTYPE_MASK;
+ regs->pstate |= PSR_BTYPE_C;
+ }
+
if (ka->sa.sa_flags & SA_RESTORER)
sigtramp = ka->sa.sa_restorer;
else
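Since do_bti() (further down, in traps.c) reports the failure as SIGILL with ILL_ILLOPC, and setup_return() above enters the handler as if reached via BLR (BTYPE=C), a program can observe BTI faults in the usual way. A userspace sketch:

	#include <signal.h>
	#include <unistd.h>

	static void ill_handler(int sig, siginfo_t *si, void *unused)
	{
		/* BTI failures arrive as ILL_ILLOPC at the faulting PC */
		_exit(si->si_code == ILL_ILLOPC ? 42 : 1);
	}

	static void install_handler(void)
	{
		struct sigaction sa = {
			.sa_sigaction	= ill_handler,
			.sa_flags	= SA_SIGINFO,
		};

		sigaction(SIGILL, &sa, NULL);
	}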
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index 7b2f2e650c44..ba40d57757d6 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -62,7 +62,7 @@
*
* x0 = struct sleep_stack_data area
*/
-ENTRY(__cpu_suspend_enter)
+SYM_FUNC_START(__cpu_suspend_enter)
stp x29, lr, [x0, #SLEEP_STACK_DATA_CALLEE_REGS]
stp x19, x20, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+16]
stp x21, x22, [x0,#SLEEP_STACK_DATA_CALLEE_REGS+32]
@@ -95,23 +95,22 @@ ENTRY(__cpu_suspend_enter)
ldp x29, lr, [sp], #16
mov x0, #1
ret
-ENDPROC(__cpu_suspend_enter)
+SYM_FUNC_END(__cpu_suspend_enter)
.pushsection ".idmap.text", "awx"
-ENTRY(cpu_resume)
+SYM_CODE_START(cpu_resume)
bl el2_setup // if in EL2 drop to EL1 cleanly
- mov x0, #ARM64_CPU_RUNTIME
bl __cpu_setup
/* enable the MMU early - so we can access sleep_save_stash by va */
adrp x1, swapper_pg_dir
bl __enable_mmu
ldr x8, =_cpu_resume
br x8
-ENDPROC(cpu_resume)
+SYM_CODE_END(cpu_resume)
.ltorg
.popsection
-ENTRY(_cpu_resume)
+SYM_FUNC_START(_cpu_resume)
mrs x1, mpidr_el1
adr_l x8, mpidr_hash // x8 = struct mpidr_hash virt address
@@ -147,4 +146,4 @@ ENTRY(_cpu_resume)
ldp x29, lr, [x29]
mov x0, #0
ret
-ENDPROC(_cpu_resume)
+SYM_FUNC_END(_cpu_resume)
diff --git a/arch/arm64/kernel/smccc-call.S b/arch/arm64/kernel/smccc-call.S
index 54655273d1e0..1f93809528a4 100644
--- a/arch/arm64/kernel/smccc-call.S
+++ b/arch/arm64/kernel/smccc-call.S
@@ -30,9 +30,9 @@
* unsigned long a6, unsigned long a7, struct arm_smccc_res *res,
* struct arm_smccc_quirk *quirk)
*/
-ENTRY(__arm_smccc_smc)
+SYM_FUNC_START(__arm_smccc_smc)
SMCCC smc
-ENDPROC(__arm_smccc_smc)
+SYM_FUNC_END(__arm_smccc_smc)
EXPORT_SYMBOL(__arm_smccc_smc)
/*
@@ -41,7 +41,7 @@ EXPORT_SYMBOL(__arm_smccc_smc)
* unsigned long a6, unsigned long a7, struct arm_smccc_res *res,
* struct arm_smccc_quirk *quirk)
*/
-ENTRY(__arm_smccc_hvc)
+SYM_FUNC_START(__arm_smccc_hvc)
SMCCC hvc
-ENDPROC(__arm_smccc_hvc)
+SYM_FUNC_END(__arm_smccc_hvc)
EXPORT_SYMBOL(__arm_smccc_hvc)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 061f60fe452f..4b6f4999d06a 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -65,7 +65,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
*/
struct secondary_data secondary_data;
/* Number of CPUs which aren't online, but looping in kernel text. */
-int cpus_stuck_in_kernel;
+static int cpus_stuck_in_kernel;
enum ipi_msg_type {
IPI_RESCHEDULE,
@@ -114,10 +114,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
*/
secondary_data.task = idle;
secondary_data.stack = task_stack_page(idle) + THREAD_SIZE;
-#if defined(CONFIG_ARM64_PTR_AUTH)
- secondary_data.ptrauth_key.apia.lo = idle->thread.keys_kernel.apia.lo;
- secondary_data.ptrauth_key.apia.hi = idle->thread.keys_kernel.apia.hi;
-#endif
update_cpu_boot_status(CPU_MMU_OFF);
__flush_dcache_area(&secondary_data, sizeof(secondary_data));
@@ -140,10 +136,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
pr_crit("CPU%u: failed to come online\n", cpu);
secondary_data.task = NULL;
secondary_data.stack = NULL;
-#if defined(CONFIG_ARM64_PTR_AUTH)
- secondary_data.ptrauth_key.apia.lo = 0;
- secondary_data.ptrauth_key.apia.hi = 0;
-#endif
__flush_dcache_area(&secondary_data, sizeof(secondary_data));
status = READ_ONCE(secondary_data.status);
if (status == CPU_MMU_OFF)
@@ -176,7 +168,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
panic("CPU%u detected unsupported configuration\n", cpu);
}
- return ret;
+ return -EIO;
}
static void init_gic_priority_masking(void)
@@ -430,7 +422,7 @@ static void __init hyp_mode_check(void)
"CPU: CPUs started in inconsistent modes");
else
pr_info("CPU: All CPU(s) started at EL1\n");
- if (IS_ENABLED(CONFIG_KVM_ARM_HOST))
+ if (IS_ENABLED(CONFIG_KVM))
kvm_compute_layout();
}
diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c
index a12c0c88d345..5f5b868292f5 100644
--- a/arch/arm64/kernel/syscall.c
+++ b/arch/arm64/kernel/syscall.c
@@ -98,6 +98,24 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
regs->orig_x0 = regs->regs[0];
regs->syscallno = scno;
+ /*
+ * BTI note:
+ * The architecture does not guarantee that SPSR.BTYPE is zero
+ * on taking an SVC, so we could return to userspace with a
+ * non-zero BTYPE after the syscall.
+ *
+ * This shouldn't matter except when userspace is explicitly
+ * doing something stupid, such as setting PROT_BTI on a page
+ * that lacks conforming BTI/PACIxSP instructions, falling
+ * through from one executable page to another with differing
+ * PROT_BTI, or messing with BTYPE via ptrace: in such cases,
+ * userspace should not be surprised if a SIGILL occurs on
+ * syscall return.
+ *
+ * So, don't touch regs->pstate & PSR_BTYPE_MASK here.
+ * (Similarly for HVC and SMC elsewhere.)
+ */
+
cortex_a76_erratum_1463225_svc_handler();
local_daif_restore(DAIF_PROCCTX);
user_exit();
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index cf402be5c573..d332590f5978 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -272,6 +272,61 @@ void arm64_notify_die(const char *str, struct pt_regs *regs,
}
}
+#ifdef CONFIG_COMPAT
+#define PSTATE_IT_1_0_SHIFT 25
+#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT)
+#define PSTATE_IT_7_2_SHIFT 10
+#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT)
+
+static u32 compat_get_it_state(struct pt_regs *regs)
+{
+ u32 it, pstate = regs->pstate;
+
+ it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT;
+ it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2;
+
+ return it;
+}
+
+static void compat_set_it_state(struct pt_regs *regs, u32 it)
+{
+ u32 pstate_it;
+
+ pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK;
+ pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK;
+
+ regs->pstate &= ~PSR_AA32_IT_MASK;
+ regs->pstate |= pstate_it;
+}
+
+static void advance_itstate(struct pt_regs *regs)
+{
+ u32 it;
+
+ /* ARM mode */
+ if (!(regs->pstate & PSR_AA32_T_BIT) ||
+ !(regs->pstate & PSR_AA32_IT_MASK))
+ return;
+
+ it = compat_get_it_state(regs);
+
+ /*
+ * If this is the last instruction of the block, wipe the IT
+ * state. Otherwise advance it.
+ */
+ if (!(it & 7))
+ it = 0;
+ else
+ it = (it & 0xe0) | ((it << 1) & 0x1f);
+
+ compat_set_it_state(regs, it);
+}
+#else
+static void advance_itstate(struct pt_regs *regs)
+{
+}
+#endif
+
void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size)
{
regs->pc += size;
@@ -282,6 +337,11 @@ void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size)
*/
if (user_mode(regs))
user_fastforward_single_step(current);
+
+ if (compat_user_mode(regs))
+ advance_itstate(regs);
+ else
+ regs->pstate &= ~PSR_BTYPE_MASK;
}
static LIST_HEAD(undef_hook);
@@ -411,6 +471,13 @@ void do_undefinstr(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(do_undefinstr);
+void do_bti(struct pt_regs *regs)
+{
+ BUG_ON(!user_mode(regs));
+ force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc);
+}
+NOKPROBE_SYMBOL(do_bti);
+
#define __user_cache_maint(insn, address, res) \
if (address >= user_addr_max()) { \
res = -EFAULT; \
@@ -566,34 +633,7 @@ static const struct sys64_hook sys64_hooks[] = {
{},
};
-
#ifdef CONFIG_COMPAT
-#define PSTATE_IT_1_0_SHIFT 25
-#define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT)
-#define PSTATE_IT_7_2_SHIFT 10
-#define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT)
-
-static u32 compat_get_it_state(struct pt_regs *regs)
-{
- u32 it, pstate = regs->pstate;
-
- it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT;
- it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2;
-
- return it;
-}
-
-static void compat_set_it_state(struct pt_regs *regs, u32 it)
-{
- u32 pstate_it;
-
- pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK;
- pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK;
-
- regs->pstate &= ~PSR_AA32_IT_MASK;
- regs->pstate |= pstate_it;
-}
-
static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs)
{
int cond;
@@ -614,42 +654,12 @@ static bool cp15_cond_valid(unsigned int esr, struct pt_regs *regs)
return aarch32_opcode_cond_checks[cond](regs->pstate);
}
-static void advance_itstate(struct pt_regs *regs)
-{
- u32 it;
-
- /* ARM mode */
- if (!(regs->pstate & PSR_AA32_T_BIT) ||
- !(regs->pstate & PSR_AA32_IT_MASK))
- return;
-
- it = compat_get_it_state(regs);
-
- /*
- * If this is the last instruction of the block, wipe the IT
- * state. Otherwise advance it.
- */
- if (!(it & 7))
- it = 0;
- else
- it = (it & 0xe0) | ((it << 1) & 0x1f);
-
- compat_set_it_state(regs, it);
-}
-
-static void arm64_compat_skip_faulting_instruction(struct pt_regs *regs,
- unsigned int sz)
-{
- advance_itstate(regs);
- arm64_skip_faulting_instruction(regs, sz);
-}
-
static void compat_cntfrq_read_handler(unsigned int esr, struct pt_regs *regs)
{
int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT;
pt_regs_write_reg(regs, reg, arch_timer_get_rate());
- arm64_compat_skip_faulting_instruction(regs, 4);
+ arm64_skip_faulting_instruction(regs, 4);
}
static const struct sys64_hook cp15_32_hooks[] = {
@@ -669,7 +679,7 @@ static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs)
pt_regs_write_reg(regs, rt, lower_32_bits(val));
pt_regs_write_reg(regs, rt2, upper_32_bits(val));
- arm64_compat_skip_faulting_instruction(regs, 4);
+ arm64_skip_faulting_instruction(regs, 4);
}
static const struct sys64_hook cp15_64_hooks[] = {
@@ -690,7 +700,7 @@ void do_cp15instr(unsigned int esr, struct pt_regs *regs)
* There is no T16 variant of a CP access, so we
* always advance PC by 4 bytes.
*/
- arm64_compat_skip_faulting_instruction(regs, 4);
+ arm64_skip_faulting_instruction(regs, 4);
return;
}
@@ -753,6 +763,7 @@ static const char *esr_class_str[] = {
[ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS",
[ESR_ELx_EC_PAC] = "PAC",
[ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC",
+ [ESR_ELx_EC_BTI] = "BTI",
[ESR_ELx_EC_ILL] = "PSTATE.IL",
[ESR_ELx_EC_SVC32] = "SVC (AArch32)",
[ESR_ELx_EC_HVC32] = "HVC (AArch32)",
@@ -906,17 +917,13 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr)
asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr)
{
- const bool was_in_nmi = in_nmi();
-
- if (!was_in_nmi)
- nmi_enter();
+ nmi_enter();
/* non-RAS errors are not containable */
if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr))
arm64_serror_panic(regs, esr);
- if (!was_in_nmi)
- nmi_exit();
+ nmi_exit();
}
asmlinkage void enter_from_user_mode(void)
@@ -1047,11 +1054,11 @@ int __init early_brk64(unsigned long addr, unsigned int esr,
return bug_handler(regs, esr) != DBG_HOOK_HANDLED;
}
-/* This registration must happen early, before debug_traps_init(). */
void __init trap_init(void)
{
register_kernel_break_hook(&bug_break_hook);
#ifdef CONFIG_KASAN_SW_TAGS
register_kernel_break_hook(&kasan_break_hook);
#endif
+ debug_traps_init();
}
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 033a48f30dbb..d51a898fd60f 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -33,20 +33,14 @@ extern char vdso_start[], vdso_end[];
extern char vdso32_start[], vdso32_end[];
#endif /* CONFIG_COMPAT_VDSO */
-/* vdso_lookup arch_index */
-enum arch_vdso_type {
- ARM64_VDSO = 0,
+enum vdso_abi {
+ VDSO_ABI_AA64,
#ifdef CONFIG_COMPAT_VDSO
- ARM64_VDSO32 = 1,
+ VDSO_ABI_AA32,
#endif /* CONFIG_COMPAT_VDSO */
};
-#ifdef CONFIG_COMPAT_VDSO
-#define VDSO_TYPES (ARM64_VDSO32 + 1)
-#else
-#define VDSO_TYPES (ARM64_VDSO + 1)
-#endif /* CONFIG_COMPAT_VDSO */
-struct __vdso_abi {
+struct vdso_abi_info {
const char *name;
const char *vdso_code_start;
const char *vdso_code_end;
@@ -57,14 +51,14 @@ struct __vdso_abi {
struct vm_special_mapping *cm;
};
-static struct __vdso_abi vdso_lookup[VDSO_TYPES] __ro_after_init = {
- {
+static struct vdso_abi_info vdso_info[] __ro_after_init = {
+ [VDSO_ABI_AA64] = {
.name = "vdso",
.vdso_code_start = vdso_start,
.vdso_code_end = vdso_end,
},
#ifdef CONFIG_COMPAT_VDSO
- {
+ [VDSO_ABI_AA32] = {
.name = "vdso32",
.vdso_code_start = vdso32_start,
.vdso_code_end = vdso32_end,
@@ -81,13 +75,13 @@ static union {
} vdso_data_store __page_aligned_data;
struct vdso_data *vdso_data = vdso_data_store.data;
-static int __vdso_remap(enum arch_vdso_type arch_index,
+static int __vdso_remap(enum vdso_abi abi,
const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
- unsigned long vdso_size = vdso_lookup[arch_index].vdso_code_end -
- vdso_lookup[arch_index].vdso_code_start;
+ unsigned long vdso_size = vdso_info[abi].vdso_code_end -
+ vdso_info[abi].vdso_code_start;
if (vdso_size != new_size)
return -EINVAL;
@@ -97,24 +91,24 @@ static int __vdso_remap(enum arch_vdso_type arch_index,
return 0;
}
-static int __vdso_init(enum arch_vdso_type arch_index)
+static int __vdso_init(enum vdso_abi abi)
{
int i;
struct page **vdso_pagelist;
unsigned long pfn;
- if (memcmp(vdso_lookup[arch_index].vdso_code_start, "\177ELF", 4)) {
+ if (memcmp(vdso_info[abi].vdso_code_start, "\177ELF", 4)) {
pr_err("vDSO is not a valid ELF object!\n");
return -EINVAL;
}
- vdso_lookup[arch_index].vdso_pages = (
- vdso_lookup[arch_index].vdso_code_end -
- vdso_lookup[arch_index].vdso_code_start) >>
+ vdso_info[abi].vdso_pages = (
+ vdso_info[abi].vdso_code_end -
+ vdso_info[abi].vdso_code_start) >>
PAGE_SHIFT;
/* Allocate the vDSO pagelist, plus a page for the data. */
- vdso_pagelist = kcalloc(vdso_lookup[arch_index].vdso_pages + 1,
+ vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages + 1,
sizeof(struct page *),
GFP_KERNEL);
if (vdso_pagelist == NULL)
@@ -125,26 +119,27 @@ static int __vdso_init(enum arch_vdso_type arch_index)
/* Grab the vDSO code pages. */
- pfn = sym_to_pfn(vdso_lookup[arch_index].vdso_code_start);
+ pfn = sym_to_pfn(vdso_info[abi].vdso_code_start);
- for (i = 0; i < vdso_lookup[arch_index].vdso_pages; i++)
+ for (i = 0; i < vdso_info[abi].vdso_pages; i++)
vdso_pagelist[i + 1] = pfn_to_page(pfn + i);
- vdso_lookup[arch_index].dm->pages = &vdso_pagelist[0];
- vdso_lookup[arch_index].cm->pages = &vdso_pagelist[1];
+ vdso_info[abi].dm->pages = &vdso_pagelist[0];
+ vdso_info[abi].cm->pages = &vdso_pagelist[1];
return 0;
}
-static int __setup_additional_pages(enum arch_vdso_type arch_index,
+static int __setup_additional_pages(enum vdso_abi abi,
struct mm_struct *mm,
struct linux_binprm *bprm,
int uses_interp)
{
unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
+ unsigned long gp_flags = 0;
void *ret;
- vdso_text_len = vdso_lookup[arch_index].vdso_pages << PAGE_SHIFT;
+ vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT;
/* Be sure to map the data page */
vdso_mapping_len = vdso_text_len + PAGE_SIZE;
@@ -156,16 +151,19 @@ static int __setup_additional_pages(enum arch_vdso_type arch_index,
ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
VM_READ|VM_MAYREAD,
- vdso_lookup[arch_index].dm);
+ vdso_info[abi].dm);
if (IS_ERR(ret))
goto up_fail;
+ if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
+ gp_flags = VM_ARM64_BTI;
+
vdso_base += PAGE_SIZE;
mm->context.vdso = (void *)vdso_base;
ret = _install_special_mapping(mm, vdso_base, vdso_text_len,
- VM_READ|VM_EXEC|
+ VM_READ|VM_EXEC|gp_flags|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- vdso_lookup[arch_index].cm);
+ vdso_info[abi].cm);
if (IS_ERR(ret))
goto up_fail;
@@ -184,46 +182,42 @@ up_fail:
static int aarch32_vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
- return __vdso_remap(ARM64_VDSO32, sm, new_vma);
+ return __vdso_remap(VDSO_ABI_AA32, sm, new_vma);
}
#endif /* CONFIG_COMPAT_VDSO */
-/*
- * aarch32_vdso_pages:
- * 0 - kuser helpers
- * 1 - sigreturn code
- * or (CONFIG_COMPAT_VDSO):
- * 0 - kuser helpers
- * 1 - vdso data
- * 2 - vdso code
- */
-#define C_VECTORS 0
+enum aarch32_map {
+ AA32_MAP_VECTORS, /* kuser helpers */
#ifdef CONFIG_COMPAT_VDSO
-#define C_VVAR 1
-#define C_VDSO 2
-#define C_PAGES (C_VDSO + 1)
+ AA32_MAP_VVAR,
+ AA32_MAP_VDSO,
#else
-#define C_SIGPAGE 1
-#define C_PAGES (C_SIGPAGE + 1)
-#endif /* CONFIG_COMPAT_VDSO */
-static struct page *aarch32_vdso_pages[C_PAGES] __ro_after_init;
-static struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = {
- {
+ AA32_MAP_SIGPAGE
+#endif
+};
+
+static struct page *aarch32_vectors_page __ro_after_init;
+#ifndef CONFIG_COMPAT_VDSO
+static struct page *aarch32_sig_page __ro_after_init;
+#endif
+
+static struct vm_special_mapping aarch32_vdso_maps[] = {
+ [AA32_MAP_VECTORS] = {
.name = "[vectors]", /* ABI */
- .pages = &aarch32_vdso_pages[C_VECTORS],
+ .pages = &aarch32_vectors_page,
},
#ifdef CONFIG_COMPAT_VDSO
- {
+ [AA32_MAP_VVAR] = {
.name = "[vvar]",
},
- {
+ [AA32_MAP_VDSO] = {
.name = "[vdso]",
.mremap = aarch32_vdso_mremap,
},
#else
- {
+ [AA32_MAP_SIGPAGE] = {
.name = "[sigpage]", /* ABI */
- .pages = &aarch32_vdso_pages[C_SIGPAGE],
+ .pages = &aarch32_sig_page,
},
#endif /* CONFIG_COMPAT_VDSO */
};
@@ -243,8 +237,8 @@ static int aarch32_alloc_kuser_vdso_page(void)
memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start,
kuser_sz);
- aarch32_vdso_pages[C_VECTORS] = virt_to_page(vdso_page);
- flush_dcache_page(aarch32_vdso_pages[C_VECTORS]);
+ aarch32_vectors_page = virt_to_page(vdso_page);
+ flush_dcache_page(aarch32_vectors_page);
return 0;
}
@@ -253,10 +247,10 @@ static int __aarch32_alloc_vdso_pages(void)
{
int ret;
- vdso_lookup[ARM64_VDSO32].dm = &aarch32_vdso_spec[C_VVAR];
- vdso_lookup[ARM64_VDSO32].cm = &aarch32_vdso_spec[C_VDSO];
+ vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR];
+ vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO];
- ret = __vdso_init(ARM64_VDSO32);
+ ret = __vdso_init(VDSO_ABI_AA32);
if (ret)
return ret;
@@ -275,8 +269,8 @@ static int __aarch32_alloc_vdso_pages(void)
return -ENOMEM;
memcpy((void *)sigpage, __aarch32_sigret_code_start, sigret_sz);
- aarch32_vdso_pages[C_SIGPAGE] = virt_to_page(sigpage);
- flush_dcache_page(aarch32_vdso_pages[C_SIGPAGE]);
+ aarch32_sig_page = virt_to_page(sigpage);
+ flush_dcache_page(aarch32_sig_page);
ret = aarch32_alloc_kuser_vdso_page();
if (ret)
@@ -306,7 +300,7 @@ static int aarch32_kuser_helpers_setup(struct mm_struct *mm)
ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE,
VM_READ | VM_EXEC |
VM_MAYREAD | VM_MAYEXEC,
- &aarch32_vdso_spec[C_VECTORS]);
+ &aarch32_vdso_maps[AA32_MAP_VECTORS]);
return PTR_ERR_OR_ZERO(ret);
}
@@ -330,7 +324,7 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm)
ret = _install_special_mapping(mm, addr, PAGE_SIZE,
VM_READ | VM_EXEC | VM_MAYREAD |
VM_MAYWRITE | VM_MAYEXEC,
- &aarch32_vdso_spec[C_SIGPAGE]);
+ &aarch32_vdso_maps[AA32_MAP_SIGPAGE]);
if (IS_ERR(ret))
goto out;
@@ -354,7 +348,7 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto out;
#ifdef CONFIG_COMPAT_VDSO
- ret = __setup_additional_pages(ARM64_VDSO32,
+ ret = __setup_additional_pages(VDSO_ABI_AA32,
mm,
bprm,
uses_interp);
@@ -371,22 +365,19 @@ out:
static int vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
- return __vdso_remap(ARM64_VDSO, sm, new_vma);
+ return __vdso_remap(VDSO_ABI_AA64, sm, new_vma);
}
-/*
- * aarch64_vdso_pages:
- * 0 - vvar
- * 1 - vdso
- */
-#define A_VVAR 0
-#define A_VDSO 1
-#define A_PAGES (A_VDSO + 1)
-static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = {
- {
+enum aarch64_map {
+ AA64_MAP_VVAR,
+ AA64_MAP_VDSO,
+};
+
+static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = {
+ [AA64_MAP_VVAR] = {
.name = "[vvar]",
},
- {
+ [AA64_MAP_VDSO] = {
.name = "[vdso]",
.mremap = vdso_mremap,
},
@@ -394,10 +385,10 @@ static struct vm_special_mapping vdso_spec[A_PAGES] __ro_after_init = {
static int __init vdso_init(void)
{
- vdso_lookup[ARM64_VDSO].dm = &vdso_spec[A_VVAR];
- vdso_lookup[ARM64_VDSO].cm = &vdso_spec[A_VDSO];
+ vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR];
+ vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO];
- return __vdso_init(ARM64_VDSO);
+ return __vdso_init(VDSO_ABI_AA64);
}
arch_initcall(vdso_init);
@@ -410,7 +401,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
- ret = __setup_additional_pages(ARM64_VDSO,
+ ret = __setup_additional_pages(VDSO_ABI_AA64,
mm,
bprm,
uses_interp);
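The vdso_lookup[] array and its #define indices are replaced throughout this file with enum-indexed tables and designated initializers, so adding or reordering entries cannot silently skew the indices. The pattern in its generic form, as a sketch:

	enum map_id { MAP_VVAR, MAP_VDSO, NR_MAPS };

	static struct vm_special_mapping maps[NR_MAPS] = {
		[MAP_VVAR] = { .name = "[vvar]" },
		[MAP_VDSO] = { .name = "[vdso]" },
	};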
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 3862cad2410c..556d424c6f52 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -17,15 +17,19 @@ obj-vdso := vgettimeofday.o note.o sigreturn.o
targets := $(obj-vdso) vdso.so vdso.so.dbg
obj-vdso := $(addprefix $(obj)/, $(obj-vdso))
+btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti
+
+# -Bsymbolic has been added for consistency with arm, the compat vDSO and
+# potential future proofing if we end up with internal calls to the exported
+# routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so
+# preparation in build-time C")).
ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \
- --build-id -n -T
+ -Bsymbolic --eh-frame-hdr --build-id -n $(btildflags-y) -T
ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18
ccflags-y += -DDISABLE_BRANCH_PROFILING
-VDSO_LDFLAGS := -Bsymbolic
-
-CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os
+CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS)
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
UBSAN_SANITIZE := n
diff --git a/arch/arm64/kernel/vdso/note.S b/arch/arm64/kernel/vdso/note.S
index 0ce6ec75a525..3d4e82290c80 100644
--- a/arch/arm64/kernel/vdso/note.S
+++ b/arch/arm64/kernel/vdso/note.S
@@ -12,9 +12,12 @@
#include <linux/version.h>
#include <linux/elfnote.h>
#include <linux/build-salt.h>
+#include <asm/assembler.h>
ELFNOTE_START(Linux, 0, "a")
.long LINUX_VERSION_CODE
ELFNOTE_END
BUILD_SALT
+
+emit_aarch64_feature_1_and
diff --git a/arch/arm64/kernel/vdso/sigreturn.S b/arch/arm64/kernel/vdso/sigreturn.S
index 12324863d5c2..620a3ef837b7 100644
--- a/arch/arm64/kernel/vdso/sigreturn.S
+++ b/arch/arm64/kernel/vdso/sigreturn.S
@@ -1,7 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Sigreturn trampoline for returning from a signal when the SA_RESTORER
- * flag is not set.
+ * flag is not set. It serves primarily as a hall of shame for crappy
+ * unwinders and features an exciting but mysterious NOP instruction.
+ *
+ * It's also fragile as hell, so please think twice before changing anything
+ * in here.
*
* Copyright (C) 2012 ARM Limited
*
@@ -9,18 +13,54 @@
*/
#include <linux/linkage.h>
+#include <asm/assembler.h>
#include <asm/unistd.h>
.text
- nop
-SYM_FUNC_START(__kernel_rt_sigreturn)
+/* Ensure that the mysterious NOP can be associated with a function. */
.cfi_startproc
+
+/*
+ * .cfi_signal_frame causes the corresponding Frame Description Entry in the
+ * .eh_frame section to be annotated as a signal frame. This allows DWARF
+ * unwinders (e.g. libstdc++) to implement _Unwind_GetIPInfo(), which permits
+ * unwinding out of the signal trampoline without the need for the mysterious
+ * NOP.
+ */
.cfi_signal_frame
- .cfi_def_cfa x29, 0
- .cfi_offset x29, 0 * 8
- .cfi_offset x30, 1 * 8
+
+/*
+ * Tell the unwinder where to locate the frame record linking back to the
+ * interrupted context. We don't provide unwind info for registers other
+ * than the frame pointer and the link register here; in practice, this
+ * is sufficient for unwinding in C/C++ based runtimes and the values in
+ * the sigcontext may have been modified by this point anyway. Debuggers
+ * already have baked-in strategies for attempting to unwind out of signals.
+ */
+ .cfi_def_cfa x29, 0
+ .cfi_offset x29, 0 * 8
+ .cfi_offset x30, 1 * 8
+
+/*
+ * This mysterious NOP is required for some unwinders (e.g. libc++) that
+ * unconditionally subtract one from the result of _Unwind_GetIP() in order to
+ * identify the calling function.
+ * Hack borrowed from arch/powerpc/kernel/vdso64/sigtramp.S.
+ */
+ nop // Mysterious NOP
+
+/*
+ * GDB relies on being able to identify the sigreturn instruction sequence to
+ * unwind from signal handlers. We cannot, therefore, use SYM_FUNC_START()
+ * here, as it will emit a BTI C instruction and break the unwinder. Thankfully,
+ * this function is only ever called from a RET and so omitting the landing pad
+ * is perfectly fine.
+ */
+SYM_CODE_START(__kernel_rt_sigreturn)
mov x8, #__NR_rt_sigreturn
svc #0
.cfi_endproc
-SYM_FUNC_END(__kernel_rt_sigreturn)
+SYM_CODE_END(__kernel_rt_sigreturn)
+
+emit_aarch64_feature_1_and
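
The rationale above can be sketched outside the kernel. The following is an illustrative userspace example, not part of this patch, assuming a GCC/libgcc toolchain that provides <unwind.h>: unwinders that only call _Unwind_GetIP() subtract one from the reported address so that a call's return address resolves inside the caller, whereas _Unwind_GetIPInfo() reports whether the frame is a signal frame and therefore needs no adjustment (and no leading NOP).

#include <stdio.h>
#include <unwind.h>

/* Print one frame per call, applying the same adjustment libstdc++ uses. */
static _Unwind_Reason_Code trace_cb(struct _Unwind_Context *ctx, void *arg)
{
        int ip_before_insn = 0;
        _Unwind_Ptr ip = _Unwind_GetIPInfo(ctx, &ip_before_insn);

        (void)arg;
        /*
         * For an ordinary frame the saved address points just after the
         * call, so subtract one to land inside the caller. For a signal
         * frame (ip_before_insn != 0) the address already identifies the
         * interrupted instruction and must not be adjusted.
         */
        if (!ip_before_insn)
                ip -= 1;

        printf("frame pc: %p\n", (void *)ip);
        return _URC_NO_REASON;
}

int main(void)
{
        _Unwind_Backtrace(trace_cb, NULL);
        return 0;
}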
diff --git a/arch/arm64/kernel/vdso/vdso.S b/arch/arm64/kernel/vdso/vdso.S
index d1414fee5274..c4b1990bf2be 100644
--- a/arch/arm64/kernel/vdso/vdso.S
+++ b/arch/arm64/kernel/vdso/vdso.S
@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/const.h>
+#include <asm/assembler.h>
#include <asm/page.h>
.globl vdso_start, vdso_end
@@ -19,3 +20,5 @@ vdso_start:
vdso_end:
.previous
+
+emit_aarch64_feature_1_and
diff --git a/arch/arm64/kernel/vdso32/sigreturn.S b/arch/arm64/kernel/vdso32/sigreturn.S
index 620524969696..b0091064c3d6 100644
--- a/arch/arm64/kernel/vdso32/sigreturn.S
+++ b/arch/arm64/kernel/vdso32/sigreturn.S
@@ -3,6 +3,9 @@
* This file provides both A32 and T32 versions, in accordance with the
* arm sigreturn code.
*
+ * Please read the comments in arch/arm64/kernel/vdso/sigreturn.S to
+ * understand some of the craziness in here.
+ *
* Copyright (C) 2018 ARM Limited
*/
@@ -17,39 +20,39 @@
.save {r0-r15}
.pad #COMPAT_SIGFRAME_REGS_OFFSET
nop
-SYM_FUNC_START(__kernel_sigreturn_arm)
+SYM_CODE_START(__kernel_sigreturn_arm)
mov r7, #__NR_compat_sigreturn
svc #0
.fnend
-SYM_FUNC_END(__kernel_sigreturn_arm)
+SYM_CODE_END(__kernel_sigreturn_arm)
.fnstart
.save {r0-r15}
.pad #COMPAT_RT_SIGFRAME_REGS_OFFSET
nop
-SYM_FUNC_START(__kernel_rt_sigreturn_arm)
+SYM_CODE_START(__kernel_rt_sigreturn_arm)
mov r7, #__NR_compat_rt_sigreturn
svc #0
.fnend
-SYM_FUNC_END(__kernel_rt_sigreturn_arm)
+SYM_CODE_END(__kernel_rt_sigreturn_arm)
.thumb
.fnstart
.save {r0-r15}
.pad #COMPAT_SIGFRAME_REGS_OFFSET
nop
-SYM_FUNC_START(__kernel_sigreturn_thumb)
+SYM_CODE_START(__kernel_sigreturn_thumb)
mov r7, #__NR_compat_sigreturn
svc #0
.fnend
-SYM_FUNC_END(__kernel_sigreturn_thumb)
+SYM_CODE_END(__kernel_sigreturn_thumb)
.fnstart
.save {r0-r15}
.pad #COMPAT_RT_SIGFRAME_REGS_OFFSET
nop
-SYM_FUNC_START(__kernel_rt_sigreturn_thumb)
+SYM_CODE_START(__kernel_rt_sigreturn_thumb)
mov r7, #__NR_compat_rt_sigreturn
svc #0
.fnend
-SYM_FUNC_END(__kernel_rt_sigreturn_thumb)
+SYM_CODE_END(__kernel_rt_sigreturn_thumb)
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 497f9675071d..3be632177631 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -17,10 +17,6 @@
#include "image.h"
-/* .exit.text needed in case of alternative patching */
-#define ARM_EXIT_KEEP(x) x
-#define ARM_EXIT_DISCARD(x)
-
OUTPUT_ARCH(aarch64)
ENTRY(_text)
@@ -72,8 +68,8 @@ jiffies = jiffies_64;
/*
* The size of the PE/COFF section that covers the kernel image, which
- * runs from stext to _edata, must be a round multiple of the PE/COFF
- * FileAlignment, which we set to its minimum value of 0x200. 'stext'
+ * runs from _stext to _edata, must be a round multiple of the PE/COFF
+ * FileAlignment, which we set to its minimum value of 0x200. '_stext'
* itself is 4 KB aligned, so padding out _edata to a 0x200 aligned
* boundary should be sufficient.
*/
@@ -95,8 +91,6 @@ SECTIONS
* order of matching.
*/
/DISCARD/ : {
- ARM_EXIT_DISCARD(EXIT_TEXT)
- ARM_EXIT_DISCARD(EXIT_DATA)
EXIT_CALL
*(.discard)
*(.discard.*)
@@ -139,6 +133,7 @@ SECTIONS
idmap_pg_dir = .;
. += IDMAP_DIR_SIZE;
+ idmap_pg_end = .;
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
tramp_pg_dir = .;
@@ -161,7 +156,7 @@ SECTIONS
__exittext_begin = .;
.exit.text : {
- ARM_EXIT_KEEP(EXIT_TEXT)
+ EXIT_TEXT
}
__exittext_end = .;
@@ -175,7 +170,7 @@ SECTIONS
*(.altinstr_replacement)
}
- . = ALIGN(PAGE_SIZE);
+ . = ALIGN(SEGMENT_ALIGN);
__inittext_end = .;
__initdata_begin = .;
@@ -188,7 +183,7 @@ SECTIONS
*(.init.rodata.* .init.bss) /* from the EFI stub */
}
.exit.data : {
- ARM_EXIT_KEEP(EXIT_DATA)
+ EXIT_DATA
}
PERCPU_SECTION(L1_CACHE_BYTES)
@@ -246,6 +241,7 @@ SECTIONS
. += INIT_DIR_SIZE;
init_pg_end = .;
+ . = ALIGN(SEGMENT_ALIGN);
__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
_end = .;
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 449386d76441..f1c1f981482c 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -3,7 +3,6 @@
# KVM configuration
#
-source "virt/kvm/Kconfig"
source "virt/lib/Kconfig"
menuconfig VIRTUALIZATION
@@ -18,7 +17,7 @@ menuconfig VIRTUALIZATION
if VIRTUALIZATION
-config KVM
+menuconfig KVM
bool "Kernel-based Virtual Machine (KVM) support"
depends on OF
# for TASKSTATS/TASK_DELAY_ACCT:
@@ -28,13 +27,11 @@ config KVM
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_ARCH_TLB_FLUSH_ALL
select KVM_MMIO
- select KVM_ARM_HOST
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select SRCU
select KVM_VFIO
select HAVE_KVM_EVENTFD
select HAVE_KVM_IRQFD
- select KVM_ARM_PMU if HW_PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_IRQCHIP
select HAVE_KVM_IRQ_ROUTING
@@ -45,23 +42,24 @@ config KVM
select TASK_DELAY_ACCT
---help---
Support hosting virtualized guest machines.
- We don't support KVM with 16K page tables yet, due to the multiple
- levels of fake page tables.
If unsure, say N.
-config KVM_ARM_HOST
- bool
- ---help---
- Provides host support for ARM processors.
+if KVM
+
+source "virt/kvm/Kconfig"
config KVM_ARM_PMU
- bool
+ bool "Virtual Performance Monitoring Unit (PMU) support"
+ depends on HW_PERF_EVENTS
+ default y
---help---
Adds support for a virtual Performance Monitoring Unit (PMU) in
virtual machines.
config KVM_INDIRECT_VECTORS
- def_bool KVM && (HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS)
+ def_bool HARDEN_BRANCH_PREDICTOR || HARDEN_EL2_VECTORS
+
+endif # KVM
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 5ffbdc39e780..8d3d9513cbfe 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -3,37 +3,25 @@
# Makefile for Kernel-based Virtual Machine module
#
-ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic
+ccflags-y += -I $(srctree)/$(src)
KVM=../../../virt/kvm
-obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
-obj-$(CONFIG_KVM_ARM_HOST) += hyp/
+obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM) += hyp/
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/psci.o $(KVM)/arm/perf.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hypercalls.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/pvtime.o
+kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
+ $(KVM)/vfio.o $(KVM)/irqchip.o \
+ arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
+ inject_fault.o regmap.o va_layout.o hyp.o hyp-init.o handle_exit.o \
+ guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o \
+ vgic-sys-reg-v3.o fpsimd.o pmu.o \
+ aarch32.o arch_timer.o \
+ vgic/vgic.o vgic/vgic-init.o \
+ vgic/vgic-irqfd.o vgic/vgic-v2.o \
+ vgic/vgic-v3.o vgic/vgic-v4.o \
+ vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
+ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
+ vgic/vgic-its.o vgic/vgic-debug.o
-kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o va_layout.o
-kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
-kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
-kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o fpsimd.o pmu.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
-
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-v4.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-debug.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
-kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
+kvm-$(CONFIG_KVM_ARM_PMU) += pmu-emul.o
diff --git a/arch/arm64/kvm/aarch32.c b/arch/arm64/kvm/aarch32.c
new file mode 100644
index 000000000000..0a356aa91aa1
--- /dev/null
+++ b/arch/arm64/kvm/aarch32.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * (not much of an) Emulation layer for 32bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#define DFSR_FSC_EXTABT_LPAE 0x10
+#define DFSR_FSC_EXTABT_nLPAE 0x08
+#define DFSR_LPAE BIT(9)
+
+/*
+ * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
+ */
+static const u8 return_offsets[8][2] = {
+ [0] = { 0, 0 }, /* Reset, unused */
+ [1] = { 4, 2 }, /* Undefined */
+ [2] = { 0, 0 }, /* SVC, unused */
+ [3] = { 4, 4 }, /* Prefetch abort */
+ [4] = { 8, 8 }, /* Data abort */
+ [5] = { 0, 0 }, /* HVC, unused */
+ [6] = { 4, 4 }, /* IRQ, unused */
+ [7] = { 4, 4 }, /* FIQ, unused */
+};
+
+/*
+ * When an exception is taken, most CPSR fields are left unchanged in the
+ * handler. However, some are explicitly overridden (e.g. M[4:0]).
+ *
+ * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
+ * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
+ * obsoleted by the ARMv7 virtualization extensions and is RES0.
+ *
+ * For the SPSR layout seen from AArch32, see:
+ * - ARM DDI 0406C.d, page B1-1148
+ * - ARM DDI 0487E.a, page G8-6264
+ *
+ * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
+ * - ARM DDI 0487E.a, page C5-426
+ *
+ * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
+ * MSB to LSB.
+ */
+static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
+{
+ u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
+ unsigned long old, new;
+
+ old = *vcpu_cpsr(vcpu);
+ new = 0;
+
+ new |= (old & PSR_AA32_N_BIT);
+ new |= (old & PSR_AA32_Z_BIT);
+ new |= (old & PSR_AA32_C_BIT);
+ new |= (old & PSR_AA32_V_BIT);
+ new |= (old & PSR_AA32_Q_BIT);
+
+ // CPSR.IT[7:0] are set to zero upon any exception
+ // See ARM DDI 0487E.a, section G1.12.3
+ // See ARM DDI 0406C.d, section B1.8.3
+
+ new |= (old & PSR_AA32_DIT_BIT);
+
+ // CPSR.SSBS is set to SCTLR.DSSBS upon any exception
+ // See ARM DDI 0487E.a, page G8-6244
+ if (sctlr & BIT(31))
+ new |= PSR_AA32_SSBS_BIT;
+
+ // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
+ // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
+ // See ARM DDI 0487E.a, page G8-6246
+ new |= (old & PSR_AA32_PAN_BIT);
+ if (!(sctlr & BIT(23)))
+ new |= PSR_AA32_PAN_BIT;
+
+ // SS does not exist in AArch32, so ignore
+
+ // CPSR.IL is set to zero upon any exception
+ // See ARM DDI 0487E.a, page G1-5527
+
+ new |= (old & PSR_AA32_GE_MASK);
+
+ // CPSR.IT[7:0] are set to zero upon any exception
+ // See prior comment above
+
+ // CPSR.E is set to SCTLR.EE upon any exception
+ // See ARM DDI 0487E.a, page G8-6245
+ // See ARM DDI 0406C.d, page B4-1701
+ if (sctlr & BIT(25))
+ new |= PSR_AA32_E_BIT;
+
+ // CPSR.A is unchanged upon an exception to Undefined, Supervisor
+ // CPSR.A is set upon an exception to other modes
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= (old & PSR_AA32_A_BIT);
+ if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
+ new |= PSR_AA32_A_BIT;
+
+ // CPSR.I is set upon any exception
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= PSR_AA32_I_BIT;
+
+ // CPSR.F is set upon an exception to FIQ
+ // CPSR.F is unchanged upon an exception to other modes
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= (old & PSR_AA32_F_BIT);
+ if (mode == PSR_AA32_MODE_FIQ)
+ new |= PSR_AA32_F_BIT;
+
+ // CPSR.T is set to SCTLR.TE upon any exception
+ // See ARM DDI 0487E.a, page G8-5514
+ // See ARM DDI 0406C.d, page B1-1181
+ if (sctlr & BIT(30))
+ new |= PSR_AA32_T_BIT;
+
+ new |= mode;
+
+ return new;
+}
+
+static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
+{
+ unsigned long spsr = *vcpu_cpsr(vcpu);
+ bool is_thumb = (spsr & PSR_AA32_T_BIT);
+ u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
+ u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
+
+ *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
+
+ /* Note: These now point to the banked copies */
+ vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr));
+ *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
+
+ /* Branch to exception vector */
+ if (sctlr & (1 << 13))
+ vect_offset += 0xffff0000;
+ else /* always have security exceptions */
+ vect_offset += vcpu_cp15(vcpu, c12_VBAR);
+
+ *vcpu_pc(vcpu) = vect_offset;
+}
+
+void kvm_inject_undef32(struct kvm_vcpu *vcpu)
+{
+ prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4);
+}
+
+/*
+ * Modelled after TakeDataAbortException() and TakePrefetchAbortException
+ * pseudocode.
+ */
+static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt,
+ unsigned long addr)
+{
+ u32 vect_offset;
+ u32 *far, *fsr;
+ bool is_lpae;
+
+ if (is_pabt) {
+ vect_offset = 12;
+ far = &vcpu_cp15(vcpu, c6_IFAR);
+ fsr = &vcpu_cp15(vcpu, c5_IFSR);
+ } else { /* !iabt */
+ vect_offset = 16;
+ far = &vcpu_cp15(vcpu, c6_DFAR);
+ fsr = &vcpu_cp15(vcpu, c5_DFSR);
+ }
+
+ prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset);
+
+ *far = addr;
+
+ /* Give the guest an IMPLEMENTATION DEFINED exception */
+ is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
+ if (is_lpae) {
+ *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
+ } else {
+ /* no need to shuffle FS[4] into DFSR[10] as it's 0 */
+ *fsr = DFSR_FSC_EXTABT_nLPAE;
+ }
+}
+
+void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+ inject_abt32(vcpu, false, addr);
+}
+
+void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr)
+{
+ inject_abt32(vcpu, true, addr);
+}
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
new file mode 100644
index 000000000000..a1fe0ea3254e
--- /dev/null
+++ b/arch/arm64/kvm/arch_timer.c
@@ -0,0 +1,1171 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/uaccess.h>
+
+#include <clocksource/arm_arch_timer.h>
+#include <asm/arch_timer.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+#include <kvm/arm_vgic.h>
+#include <kvm/arm_arch_timer.h>
+
+#include "trace.h"
+
+static struct timecounter *timecounter;
+static unsigned int host_vtimer_irq;
+static unsigned int host_ptimer_irq;
+static u32 host_vtimer_irq_flags;
+static u32 host_ptimer_irq_flags;
+
+static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
+
+static const struct kvm_irq_level default_ptimer_irq = {
+ .irq = 30,
+ .level = 1,
+};
+
+static const struct kvm_irq_level default_vtimer_irq = {
+ .irq = 27,
+ .level = 1,
+};
+
+static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
+ struct arch_timer_context *timer_ctx);
+static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+ struct arch_timer_context *timer,
+ enum kvm_arch_timer_regs treg,
+ u64 val);
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+ struct arch_timer_context *timer,
+ enum kvm_arch_timer_regs treg);
+
+u64 kvm_phys_timer_read(void)
+{
+ return timecounter->cc->read(timecounter->cc);
+}
+
+static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
+{
+ if (has_vhe()) {
+ map->direct_vtimer = vcpu_vtimer(vcpu);
+ map->direct_ptimer = vcpu_ptimer(vcpu);
+ map->emul_ptimer = NULL;
+ } else {
+ map->direct_vtimer = vcpu_vtimer(vcpu);
+ map->direct_ptimer = NULL;
+ map->emul_ptimer = vcpu_ptimer(vcpu);
+ }
+
+ trace_kvm_get_timer_map(vcpu->vcpu_id, map);
+}
+
+static inline bool userspace_irqchip(struct kvm *kvm)
+{
+ return static_branch_unlikely(&userspace_irqchip_in_use) &&
+ unlikely(!irqchip_in_kernel(kvm));
+}
+
+static void soft_timer_start(struct hrtimer *hrt, u64 ns)
+{
+ hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
+ HRTIMER_MODE_ABS_HARD);
+}
+
+static void soft_timer_cancel(struct hrtimer *hrt)
+{
+ hrtimer_cancel(hrt);
+}
+
+static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
+{
+ struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
+ struct arch_timer_context *ctx;
+ struct timer_map map;
+
+ /*
+ * We may see a timer interrupt after vcpu_put() has been called which
+ * sets the CPU's vcpu pointer to NULL, because even though the timer
+ * has been disabled in timer_save_state(), the hardware interrupt
+ * signal may not have been retired from the interrupt controller yet.
+ */
+ if (!vcpu)
+ return IRQ_HANDLED;
+
+ get_timer_map(vcpu, &map);
+
+ if (irq == host_vtimer_irq)
+ ctx = map.direct_vtimer;
+ else
+ ctx = map.direct_ptimer;
+
+ if (kvm_timer_should_fire(ctx))
+ kvm_timer_update_irq(vcpu, true, ctx);
+
+ if (userspace_irqchip(vcpu->kvm) &&
+ !static_branch_unlikely(&has_gic_active_state))
+ disable_percpu_irq(host_vtimer_irq);
+
+ return IRQ_HANDLED;
+}
+
+static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
+{
+ u64 cval, now;
+
+ cval = timer_ctx->cnt_cval;
+ now = kvm_phys_timer_read() - timer_ctx->cntvoff;
+
+ if (now < cval) {
+ u64 ns;
+
+ ns = cyclecounter_cyc2ns(timecounter->cc,
+ cval - now,
+ timecounter->mask,
+ &timecounter->frac);
+ return ns;
+ }
+
+ return 0;
+}
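
For reference, the cycle-to-nanosecond step above is a fixed-point multiply and shift. Below is a minimal sketch of that arithmetic with made-up mult/shift values (the kernel derives the real ones from the counter frequency); it is not the actual cyclecounter helper.

#include <stdint.h>
#include <stdio.h>

/* Generic fixed-point cycles -> ns conversion: ns ~= (cycles * mult) >> shift. */
static uint64_t cyc_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        /* Hypothetical 50 MHz counter: 20 ns per cycle, so pick mult/shift
         * such that (1 * mult) >> shift == 20. */
        uint32_t mult = 20 << 8, shift = 8;

        printf("%llu ns\n", (unsigned long long)cyc_to_ns(50000000, mult, shift));
        return 0;
}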
+
+static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
+{
+ WARN_ON(timer_ctx && timer_ctx->loaded);
+ return timer_ctx &&
+ !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
+ (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
+}
+
+/*
+ * Returns the earliest expiration time in ns among guest timers.
+ * Note that it will return 0 if none of the timers can fire.
+ */
+static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
+{
+ u64 min_delta = ULLONG_MAX;
+ int i;
+
+ for (i = 0; i < NR_KVM_TIMERS; i++) {
+ struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
+
+ WARN(ctx->loaded, "timer %d loaded\n", i);
+ if (kvm_timer_irq_can_fire(ctx))
+ min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
+ }
+
+ /* If none of the timers can fire, return 0 */
+ if (min_delta == ULLONG_MAX)
+ return 0;
+
+ return min_delta;
+}
+
+static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
+{
+ struct arch_timer_cpu *timer;
+ struct kvm_vcpu *vcpu;
+ u64 ns;
+
+ timer = container_of(hrt, struct arch_timer_cpu, bg_timer);
+ vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
+
+ /*
+ * Check that the timer has really expired from the guest's
+ * PoV (NTP on the host may have forced it to expire
+ * early). If we should have slept longer, restart it.
+ */
+ ns = kvm_timer_earliest_exp(vcpu);
+ if (unlikely(ns)) {
+ hrtimer_forward_now(hrt, ns_to_ktime(ns));
+ return HRTIMER_RESTART;
+ }
+
+ kvm_vcpu_wake_up(vcpu);
+ return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
+{
+ struct arch_timer_context *ctx;
+ struct kvm_vcpu *vcpu;
+ u64 ns;
+
+ ctx = container_of(hrt, struct arch_timer_context, hrtimer);
+ vcpu = ctx->vcpu;
+
+ trace_kvm_timer_hrtimer_expire(ctx);
+
+ /*
+ * Check that the timer has really expired from the guest's
+ * PoV (NTP on the host may have forced it to expire
+ * early). If not ready, schedule for a later time.
+ */
+ ns = kvm_timer_compute_delta(ctx);
+ if (unlikely(ns)) {
+ hrtimer_forward_now(hrt, ns_to_ktime(ns));
+ return HRTIMER_RESTART;
+ }
+
+ kvm_timer_update_irq(vcpu, true, ctx);
+ return HRTIMER_NORESTART;
+}
+
+static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
+{
+ enum kvm_arch_timers index;
+ u64 cval, now;
+
+ if (!timer_ctx)
+ return false;
+
+ index = arch_timer_ctx_index(timer_ctx);
+
+ if (timer_ctx->loaded) {
+ u32 cnt_ctl = 0;
+
+ switch (index) {
+ case TIMER_VTIMER:
+ cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
+ break;
+ case TIMER_PTIMER:
+ cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
+ break;
+ case NR_KVM_TIMERS:
+ /* GCC is braindead */
+ cnt_ctl = 0;
+ break;
+ }
+
+ return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
+ (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
+ !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
+ }
+
+ if (!kvm_timer_irq_can_fire(timer_ctx))
+ return false;
+
+ cval = timer_ctx->cnt_cval;
+ now = kvm_phys_timer_read() - timer_ctx->cntvoff;
+
+ return cval <= now;
+}
+
+bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
+{
+ struct timer_map map;
+
+ get_timer_map(vcpu, &map);
+
+ return kvm_timer_should_fire(map.direct_vtimer) ||
+ kvm_timer_should_fire(map.direct_ptimer) ||
+ kvm_timer_should_fire(map.emul_ptimer);
+}
+
+/*
+ * Reflect the timer output level into the kvm_run structure
+ */
+void kvm_timer_update_run(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+ struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ struct kvm_sync_regs *regs = &vcpu->run->s.regs;
+
+ /* Populate the device bitmap with the timer states */
+ regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
+ KVM_ARM_DEV_EL1_PTIMER);
+ if (kvm_timer_should_fire(vtimer))
+ regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;
+ if (kvm_timer_should_fire(ptimer))
+ regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
+}
+
+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
+ struct arch_timer_context *timer_ctx)
+{
+ int ret;
+
+ timer_ctx->irq.level = new_level;
+ trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq,
+ timer_ctx->irq.level);
+
+ if (!userspace_irqchip(vcpu->kvm)) {
+ ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+ timer_ctx->irq.irq,
+ timer_ctx->irq.level,
+ timer_ctx);
+ WARN_ON(ret);
+ }
+}
+
+/* Only called for a fully emulated timer */
+static void timer_emulate(struct arch_timer_context *ctx)
+{
+ bool should_fire = kvm_timer_should_fire(ctx);
+
+ trace_kvm_timer_emulate(ctx, should_fire);
+
+ if (should_fire != ctx->irq.level) {
+ kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
+ return;
+ }
+
+ /*
+ * If the timer can fire now, we don't need to have a soft timer
+ * scheduled for the future. If the timer cannot fire at all,
+ * then we also don't need a soft timer.
+ */
+ if (!kvm_timer_irq_can_fire(ctx)) {
+ soft_timer_cancel(&ctx->hrtimer);
+ return;
+ }
+
+ soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
+}
+
+static void timer_save_state(struct arch_timer_context *ctx)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+ enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
+ unsigned long flags;
+
+ if (!timer->enabled)
+ return;
+
+ local_irq_save(flags);
+
+ if (!ctx->loaded)
+ goto out;
+
+ switch (index) {
+ case TIMER_VTIMER:
+ ctx->cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
+ ctx->cnt_cval = read_sysreg_el0(SYS_CNTV_CVAL);
+
+ /* Disable the timer */
+ write_sysreg_el0(0, SYS_CNTV_CTL);
+ isb();
+
+ break;
+ case TIMER_PTIMER:
+ ctx->cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
+ ctx->cnt_cval = read_sysreg_el0(SYS_CNTP_CVAL);
+
+ /* Disable the timer */
+ write_sysreg_el0(0, SYS_CNTP_CTL);
+ isb();
+
+ break;
+ case NR_KVM_TIMERS:
+ BUG();
+ }
+
+ trace_kvm_timer_save_state(ctx);
+
+ ctx->loaded = false;
+out:
+ local_irq_restore(flags);
+}
+
+/*
+ * Schedule the background timer before calling kvm_vcpu_block, so that this
+ * thread is removed from its waitqueue and made runnable when there's a timer
+ * interrupt to handle.
+ */
+static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
+
+ get_timer_map(vcpu, &map);
+
+ /*
+ * If no timers are capable of raising interrupts (disabled or
+ * masked), then there's no more work for us to do.
+ */
+ if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
+ !kvm_timer_irq_can_fire(map.direct_ptimer) &&
+ !kvm_timer_irq_can_fire(map.emul_ptimer))
+ return;
+
+ /*
+ * At least one guest timer will expire. Schedule a background timer.
+ * Set the earliest expiration time among the guest timers.
+ */
+ soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
+}
+
+static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+ soft_timer_cancel(&timer->bg_timer);
+}
+
+static void timer_restore_state(struct arch_timer_context *ctx)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+ enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
+ unsigned long flags;
+
+ if (!timer->enabled)
+ return;
+
+ local_irq_save(flags);
+
+ if (ctx->loaded)
+ goto out;
+
+ switch (index) {
+ case TIMER_VTIMER:
+ write_sysreg_el0(ctx->cnt_cval, SYS_CNTV_CVAL);
+ isb();
+ write_sysreg_el0(ctx->cnt_ctl, SYS_CNTV_CTL);
+ break;
+ case TIMER_PTIMER:
+ write_sysreg_el0(ctx->cnt_cval, SYS_CNTP_CVAL);
+ isb();
+ write_sysreg_el0(ctx->cnt_ctl, SYS_CNTP_CTL);
+ break;
+ case NR_KVM_TIMERS:
+ BUG();
+ }
+
+ trace_kvm_timer_restore_state(ctx);
+
+ ctx->loaded = true;
+out:
+ local_irq_restore(flags);
+}
+
+static void set_cntvoff(u64 cntvoff)
+{
+ kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
+}
+
+static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
+{
+ int r;
+ r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active);
+ WARN_ON(r);
+}
+
+static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
+{
+ struct kvm_vcpu *vcpu = ctx->vcpu;
+ bool phys_active = false;
+
+ /*
+ * Update the timer output so that it is likely to match the
+ * state we're about to restore. If the timer expires between
+ * this point and the register restoration, we'll take the
+ * interrupt anyway.
+ */
+ kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
+
+ if (irqchip_in_kernel(vcpu->kvm))
+ phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
+
+ phys_active |= ctx->irq.level;
+
+ set_timer_irq_phys_active(ctx, phys_active);
+}
+
+static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+ /*
+ * Update the timer output so that it is likely to match the
+ * state we're about to restore. If the timer expires between
+ * this point and the register restoration, we'll take the
+ * interrupt anyway.
+ */
+ kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
+
+ /*
+ * When using a userspace irqchip with the architected timers and a
+ * host interrupt controller that doesn't support an active state, we
+ * must still prevent continuously exiting from the guest, and
+ * therefore mask the physical interrupt by disabling it on the host
+ * interrupt controller when the virtual level is high, such that the
+ * guest can make forward progress. Once we detect the output level
+ * being de-asserted, we unmask the interrupt again so that we exit
+ * from the guest when the timer fires.
+ */
+ if (vtimer->irq.level)
+ disable_percpu_irq(host_vtimer_irq);
+ else
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+}
+
+void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
+
+ if (unlikely(!timer->enabled))
+ return;
+
+ get_timer_map(vcpu, &map);
+
+ if (static_branch_likely(&has_gic_active_state)) {
+ kvm_timer_vcpu_load_gic(map.direct_vtimer);
+ if (map.direct_ptimer)
+ kvm_timer_vcpu_load_gic(map.direct_ptimer);
+ } else {
+ kvm_timer_vcpu_load_nogic(vcpu);
+ }
+
+ set_cntvoff(map.direct_vtimer->cntvoff);
+
+ kvm_timer_unblocking(vcpu);
+
+ timer_restore_state(map.direct_vtimer);
+ if (map.direct_ptimer)
+ timer_restore_state(map.direct_ptimer);
+
+ if (map.emul_ptimer)
+ timer_emulate(map.emul_ptimer);
+}
+
+bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+ struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
+ bool vlevel, plevel;
+
+ if (likely(irqchip_in_kernel(vcpu->kvm)))
+ return false;
+
+ vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
+ plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;
+
+ return kvm_timer_should_fire(vtimer) != vlevel ||
+ kvm_timer_should_fire(ptimer) != plevel;
+}
+
+void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
+ struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
+
+ if (unlikely(!timer->enabled))
+ return;
+
+ get_timer_map(vcpu, &map);
+
+ timer_save_state(map.direct_vtimer);
+ if (map.direct_ptimer)
+ timer_save_state(map.direct_ptimer);
+
+ /*
+ * Cancel soft timer emulation, because the only case where we
+ * need it after a vcpu_put is in the context of a sleeping VCPU, and
+ * in that case we already factor in the deadline for the physical
+ * timer when scheduling the bg_timer.
+ *
+ * In any case, we re-schedule the hrtimer for the physical timer when
+ * coming back to the VCPU thread in kvm_timer_vcpu_load().
+ */
+ if (map.emul_ptimer)
+ soft_timer_cancel(&map.emul_ptimer->hrtimer);
+
+ if (rcuwait_active(wait))
+ kvm_timer_blocking(vcpu);
+
+ /*
+ * The kernel may decide to run userspace after calling vcpu_put, so
+ * we reset cntvoff to 0 to ensure a consistent read between user
+ * accesses to the virtual counter and kernel accesses to the physical
+ * counter in the non-VHE case. For VHE, the virtual counter uses a fixed
+ * virtual offset of zero, so there is no need to zero the CNTVOFF_EL2 register.
+ */
+ set_cntvoff(0);
+}
+
+/*
+ * With a userspace irqchip we have to check if the guest de-asserted the
+ * timer and if so, unmask the timer irq signal on the host interrupt
+ * controller to ensure that we see future timer signals.
+ */
+static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+
+ if (!kvm_timer_should_fire(vtimer)) {
+ kvm_timer_update_irq(vcpu, false, vtimer);
+ if (static_branch_likely(&has_gic_active_state))
+ set_timer_irq_phys_active(vtimer, false);
+ else
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+ }
+}
+
+void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+ if (unlikely(!timer->enabled))
+ return;
+
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+ unmask_vtimer_irq_user(vcpu);
+}
+
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
+
+ get_timer_map(vcpu, &map);
+
+ /*
+ * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
+ * and to 0 for ARMv7. We provide an implementation that always
+ * resets the timer to be disabled and unmasked and is compliant with
+ * the ARMv7 architecture.
+ */
+ vcpu_vtimer(vcpu)->cnt_ctl = 0;
+ vcpu_ptimer(vcpu)->cnt_ctl = 0;
+
+ if (timer->enabled) {
+ kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
+ kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
+ if (map.direct_ptimer)
+ kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
+ }
+ }
+
+ if (map.emul_ptimer)
+ soft_timer_cancel(&map.emul_ptimer->hrtimer);
+
+ return 0;
+}
+
+/* Make the updates of cntvoff for all vtimer contexts atomic */
+static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
+{
+ int i;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu *tmp;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, tmp, kvm)
+ vcpu_vtimer(tmp)->cntvoff = cntvoff;
+
+ /*
+ * When called from the vcpu create path, the CPU being created is not
+ * included in the loop above, so we just set it here as well.
+ */
+ vcpu_vtimer(vcpu)->cntvoff = cntvoff;
+ mutex_unlock(&kvm->lock);
+}
+
+void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+ struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+
+ /* Synchronize cntvoff across all vtimers of a VM. */
+ update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
+ ptimer->cntvoff = 0;
+
+ hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ timer->bg_timer.function = kvm_bg_timer_expire;
+
+ hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ vtimer->hrtimer.function = kvm_hrtimer_expire;
+ ptimer->hrtimer.function = kvm_hrtimer_expire;
+
+ vtimer->irq.irq = default_vtimer_irq.irq;
+ ptimer->irq.irq = default_ptimer_irq.irq;
+
+ vtimer->host_timer_irq = host_vtimer_irq;
+ ptimer->host_timer_irq = host_ptimer_irq;
+
+ vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
+ ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
+
+ vtimer->vcpu = vcpu;
+ ptimer->vcpu = vcpu;
+}
+
+static void kvm_timer_init_interrupt(void *info)
+{
+ enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+ enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
+}
+
+int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
+{
+ struct arch_timer_context *timer;
+
+ switch (regid) {
+ case KVM_REG_ARM_TIMER_CTL:
+ timer = vcpu_vtimer(vcpu);
+ kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
+ break;
+ case KVM_REG_ARM_TIMER_CNT:
+ timer = vcpu_vtimer(vcpu);
+ update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
+ break;
+ case KVM_REG_ARM_TIMER_CVAL:
+ timer = vcpu_vtimer(vcpu);
+ kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
+ break;
+ case KVM_REG_ARM_PTIMER_CTL:
+ timer = vcpu_ptimer(vcpu);
+ kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
+ break;
+ case KVM_REG_ARM_PTIMER_CVAL:
+ timer = vcpu_ptimer(vcpu);
+ kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
+ break;
+
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+static u64 read_timer_ctl(struct arch_timer_context *timer)
+{
+ /*
+ * Set ISTATUS bit if it's expired.
+ * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
+ * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
+ * regardless of ENABLE bit for our implementation convenience.
+ */
+ if (!kvm_timer_compute_delta(timer))
+ return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT;
+ else
+ return timer->cnt_ctl;
+}
+
+u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
+{
+ switch (regid) {
+ case KVM_REG_ARM_TIMER_CTL:
+ return kvm_arm_timer_read(vcpu,
+ vcpu_vtimer(vcpu), TIMER_REG_CTL);
+ case KVM_REG_ARM_TIMER_CNT:
+ return kvm_arm_timer_read(vcpu,
+ vcpu_vtimer(vcpu), TIMER_REG_CNT);
+ case KVM_REG_ARM_TIMER_CVAL:
+ return kvm_arm_timer_read(vcpu,
+ vcpu_vtimer(vcpu), TIMER_REG_CVAL);
+ case KVM_REG_ARM_PTIMER_CTL:
+ return kvm_arm_timer_read(vcpu,
+ vcpu_ptimer(vcpu), TIMER_REG_CTL);
+ case KVM_REG_ARM_PTIMER_CNT:
+ return kvm_arm_timer_read(vcpu,
+ vcpu_ptimer(vcpu), TIMER_REG_CNT);
+ case KVM_REG_ARM_PTIMER_CVAL:
+ return kvm_arm_timer_read(vcpu,
+ vcpu_ptimer(vcpu), TIMER_REG_CVAL);
+ }
+ return (u64)-1;
+}
+
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+ struct arch_timer_context *timer,
+ enum kvm_arch_timer_regs treg)
+{
+ u64 val;
+
+ switch (treg) {
+ case TIMER_REG_TVAL:
+ val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
+ val &= lower_32_bits(val);
+ break;
+
+ case TIMER_REG_CTL:
+ val = read_timer_ctl(timer);
+ break;
+
+ case TIMER_REG_CVAL:
+ val = timer->cnt_cval;
+ break;
+
+ case TIMER_REG_CNT:
+ val = kvm_phys_timer_read() - timer->cntvoff;
+ break;
+
+ default:
+ BUG();
+ }
+
+ return val;
+}
+
+u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
+ enum kvm_arch_timers tmr,
+ enum kvm_arch_timer_regs treg)
+{
+ u64 val;
+
+ preempt_disable();
+ kvm_timer_vcpu_put(vcpu);
+
+ val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
+
+ kvm_timer_vcpu_load(vcpu);
+ preempt_enable();
+
+ return val;
+}
+
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+ struct arch_timer_context *timer,
+ enum kvm_arch_timer_regs treg,
+ u64 val)
+{
+ switch (treg) {
+ case TIMER_REG_TVAL:
+ timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val;
+ break;
+
+ case TIMER_REG_CTL:
+ timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT;
+ break;
+
+ case TIMER_REG_CVAL:
+ timer->cnt_cval = val;
+ break;
+
+ default:
+ BUG();
+ }
+}
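
As a worked example of the TVAL handling above (illustrative only, using hypothetical counter values rather than real system registers): writing TVAL programs CVAL relative to the current offset count, and reading TVAL back yields the signed, truncated distance to CVAL.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the TIMER_REG_TVAL write above: CVAL = (count - cntvoff) + (s32)TVAL. */
static uint64_t tval_write_to_cval(uint64_t count, uint64_t cntvoff, int32_t tval)
{
        return count - cntvoff + tval;
}

/* Mirrors the TIMER_REG_TVAL read above, truncated to 32 bits. */
static int32_t tval_read(uint64_t count, uint64_t cntvoff, uint64_t cval)
{
        return (int32_t)(cval - count + cntvoff);
}

int main(void)
{
        uint64_t now = 1000000, off = 0;        /* hypothetical values */
        uint64_t cval = tval_write_to_cval(now, off, 500);

        printf("cval = %llu\n", (unsigned long long)cval);       /* now + 500 */
        printf("tval = %d\n", tval_read(now + 200, off, cval));  /* 300 cycles left */
        return 0;
}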
+
+void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
+ enum kvm_arch_timers tmr,
+ enum kvm_arch_timer_regs treg,
+ u64 val)
+{
+ preempt_disable();
+ kvm_timer_vcpu_put(vcpu);
+
+ kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
+
+ kvm_timer_vcpu_load(vcpu);
+ preempt_enable();
+}
+
+static int kvm_timer_starting_cpu(unsigned int cpu)
+{
+ kvm_timer_init_interrupt(NULL);
+ return 0;
+}
+
+static int kvm_timer_dying_cpu(unsigned int cpu)
+{
+ disable_percpu_irq(host_vtimer_irq);
+ return 0;
+}
+
+int kvm_timer_hyp_init(bool has_gic)
+{
+ struct arch_timer_kvm_info *info;
+ int err;
+
+ info = arch_timer_get_kvm_info();
+ timecounter = &info->timecounter;
+
+ if (!timecounter->cc) {
+ kvm_err("kvm_arch_timer: uninitialized timecounter\n");
+ return -ENODEV;
+ }
+
+ /* First, do the virtual EL1 timer irq */
+
+ if (info->virtual_irq <= 0) {
+ kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
+ info->virtual_irq);
+ return -ENODEV;
+ }
+ host_vtimer_irq = info->virtual_irq;
+
+ host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
+ if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
+ host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
+ kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
+ host_vtimer_irq);
+ host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
+ }
+
+ err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
+ "kvm guest vtimer", kvm_get_running_vcpus());
+ if (err) {
+ kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
+ host_vtimer_irq, err);
+ return err;
+ }
+
+ if (has_gic) {
+ err = irq_set_vcpu_affinity(host_vtimer_irq,
+ kvm_get_running_vcpus());
+ if (err) {
+ kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+ goto out_free_irq;
+ }
+
+ static_branch_enable(&has_gic_active_state);
+ }
+
+ kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
+
+ /* Now let's do the physical EL1 timer irq */
+
+ if (info->physical_irq > 0) {
+ host_ptimer_irq = info->physical_irq;
+ host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
+ if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
+ host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
+ kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
+ host_ptimer_irq);
+ host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
+ }
+
+ err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
+ "kvm guest ptimer", kvm_get_running_vcpus());
+ if (err) {
+ kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
+ host_ptimer_irq, err);
+ return err;
+ }
+
+ if (has_gic) {
+ err = irq_set_vcpu_affinity(host_ptimer_irq,
+ kvm_get_running_vcpus());
+ if (err) {
+ kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+ goto out_free_irq;
+ }
+ }
+
+ kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
+ } else if (has_vhe()) {
+ kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
+ info->physical_irq);
+ err = -ENODEV;
+ goto out_free_irq;
+ }
+
+ cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
+ "kvm/arm/timer:starting", kvm_timer_starting_cpu,
+ kvm_timer_dying_cpu);
+ return 0;
+out_free_irq:
+ free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
+ return err;
+}
+
+void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+ soft_timer_cancel(&timer->bg_timer);
+}
+
+static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
+{
+ int vtimer_irq, ptimer_irq;
+ int i, ret;
+
+ vtimer_irq = vcpu_vtimer(vcpu)->irq.irq;
+ ret = kvm_vgic_set_owner(vcpu, vtimer_irq, vcpu_vtimer(vcpu));
+ if (ret)
+ return false;
+
+ ptimer_irq = vcpu_ptimer(vcpu)->irq.irq;
+ ret = kvm_vgic_set_owner(vcpu, ptimer_irq, vcpu_ptimer(vcpu));
+ if (ret)
+ return false;
+
+ kvm_for_each_vcpu(i, vcpu, vcpu->kvm) {
+ if (vcpu_vtimer(vcpu)->irq.irq != vtimer_irq ||
+ vcpu_ptimer(vcpu)->irq.irq != ptimer_irq)
+ return false;
+ }
+
+ return true;
+}
+
+bool kvm_arch_timer_get_input_level(int vintid)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+ struct arch_timer_context *timer;
+
+ if (vintid == vcpu_vtimer(vcpu)->irq.irq)
+ timer = vcpu_vtimer(vcpu);
+ else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
+ timer = vcpu_ptimer(vcpu);
+ else
+ BUG();
+
+ return kvm_timer_should_fire(timer);
+}
+
+int kvm_timer_enable(struct kvm_vcpu *vcpu)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
+ int ret;
+
+ if (timer->enabled)
+ return 0;
+
+ /* Without a VGIC we do not map virtual IRQs to physical IRQs */
+ if (!irqchip_in_kernel(vcpu->kvm))
+ goto no_vgic;
+
+ if (!vgic_initialized(vcpu->kvm))
+ return -ENODEV;
+
+ if (!timer_irqs_are_valid(vcpu)) {
+ kvm_debug("incorrectly configured timer irqs\n");
+ return -EINVAL;
+ }
+
+ get_timer_map(vcpu, &map);
+
+ ret = kvm_vgic_map_phys_irq(vcpu,
+ map.direct_vtimer->host_timer_irq,
+ map.direct_vtimer->irq.irq,
+ kvm_arch_timer_get_input_level);
+ if (ret)
+ return ret;
+
+ if (map.direct_ptimer) {
+ ret = kvm_vgic_map_phys_irq(vcpu,
+ map.direct_ptimer->host_timer_irq,
+ map.direct_ptimer->irq.irq,
+ kvm_arch_timer_get_input_level);
+ }
+
+ if (ret)
+ return ret;
+
+no_vgic:
+ timer->enabled = 1;
+ return 0;
+}
+
+/*
+ * On a VHE system, we only need to configure the EL2 timer trap register once,
+ * not on every world switch.
+ * The host kernel runs at EL2 with HCR_EL2.TGE == 1, which makes those bits
+ * have no effect on host kernel execution.
+ */
+void kvm_timer_init_vhe(void)
+{
+ /* When HCR_EL2.E2H == 1, EL1PCEN and EL1PCTEN are shifted by 10 */
+ u32 cnthctl_shift = 10;
+ u64 val;
+
+ /*
+ * VHE systems allow the guest direct access to the EL1 physical
+ * timer/counter.
+ */
+ val = read_sysreg(cnthctl_el2);
+ val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
+ val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
+ write_sysreg(val, cnthctl_el2);
+}
+
+static void set_timer_irqs(struct kvm *kvm, int vtimer_irq, int ptimer_irq)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu_vtimer(vcpu)->irq.irq = vtimer_irq;
+ vcpu_ptimer(vcpu)->irq.irq = ptimer_irq;
+ }
+}
+
+int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+ int __user *uaddr = (int __user *)(long)attr->addr;
+ struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+ struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ int irq;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return -EINVAL;
+
+ if (get_user(irq, uaddr))
+ return -EFAULT;
+
+ if (!(irq_is_ppi(irq)))
+ return -EINVAL;
+
+ if (vcpu->arch.timer_cpu.enabled)
+ return -EBUSY;
+
+ switch (attr->attr) {
+ case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+ set_timer_irqs(vcpu->kvm, irq, ptimer->irq.irq);
+ break;
+ case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+ set_timer_irqs(vcpu->kvm, vtimer->irq.irq, irq);
+ break;
+ default:
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+ int __user *uaddr = (int __user *)(long)attr->addr;
+ struct arch_timer_context *timer;
+ int irq;
+
+ switch (attr->attr) {
+ case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+ timer = vcpu_vtimer(vcpu);
+ break;
+ case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+ timer = vcpu_ptimer(vcpu);
+ break;
+ default:
+ return -ENXIO;
+ }
+
+ irq = timer->irq.irq;
+ return put_user(irq, uaddr);
+}
+
+int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
+ case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
+ return 0;
+ }
+
+ return -ENXIO;
+}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
new file mode 100644
index 000000000000..7a57381c05e8
--- /dev/null
+++ b/arch/arm64/kvm/arm.c
@@ -0,0 +1,1710 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/bug.h>
+#include <linux/cpu_pm.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/kvm.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/sched/stat.h>
+#include <trace/events/kvm.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_arm.h"
+
+#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+#include <asm/mman.h>
+#include <asm/tlbflush.h>
+#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/virt.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_coproc.h>
+#include <asm/sections.h>
+
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_pmu.h>
+#include <kvm/arm_psci.h>
+
+#ifdef REQUIRES_VIRT
+__asm__(".arch_extension virt");
+#endif
+
+DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+
+/* The VMID used in the VTTBR */
+static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
+static u32 kvm_next_vmid;
+static DEFINE_SPINLOCK(kvm_vmid_lock);
+
+static bool vgic_present;
+
+static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
+}
+
+int kvm_arch_hardware_setup(void *opaque)
+{
+ return 0;
+}
+
+int kvm_arch_check_processor_compat(void *opaque)
+{
+ return 0;
+}
+
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_ARM_NISV_TO_USER:
+ r = 0;
+ kvm->arch.return_nisv_io_abort_to_user = true;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_arm_default_max_vcpus(void)
+{
+ return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS;
+}
+
+/**
+ * kvm_arch_init_vm - initializes a VM data structure
+ * @kvm: pointer to the KVM struct
+ */
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+ int ret, cpu;
+
+ ret = kvm_arm_setup_stage2(kvm, type);
+ if (ret)
+ return ret;
+
+ kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran));
+ if (!kvm->arch.last_vcpu_ran)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu)
+ *per_cpu_ptr(kvm->arch.last_vcpu_ran, cpu) = -1;
+
+ ret = kvm_alloc_stage2_pgd(kvm);
+ if (ret)
+ goto out_fail_alloc;
+
+ ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
+ if (ret)
+ goto out_free_stage2_pgd;
+
+ kvm_vgic_early_init(kvm);
+
+ /* Mark the initial VMID generation invalid */
+ kvm->arch.vmid.vmid_gen = 0;
+
+ /* The maximum number of VCPUs is limited by the host's GIC model */
+ kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
+
+ return ret;
+out_free_stage2_pgd:
+ kvm_free_stage2_pgd(kvm);
+out_fail_alloc:
+ free_percpu(kvm->arch.last_vcpu_ran);
+ kvm->arch.last_vcpu_ran = NULL;
+ return ret;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+
+/**
+ * kvm_arch_destroy_vm - destroy the VM data structure
+ * @kvm: pointer to the KVM struct
+ */
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+ int i;
+
+ kvm_vgic_destroy(kvm);
+
+ free_percpu(kvm->arch.last_vcpu_ran);
+ kvm->arch.last_vcpu_ran = NULL;
+
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ if (kvm->vcpus[i]) {
+ kvm_vcpu_destroy(kvm->vcpus[i]);
+ kvm->vcpus[i] = NULL;
+ }
+ }
+ atomic_set(&kvm->online_vcpus, 0);
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+ int r;
+ switch (ext) {
+ case KVM_CAP_IRQCHIP:
+ r = vgic_present;
+ break;
+ case KVM_CAP_IOEVENTFD:
+ case KVM_CAP_DEVICE_CTRL:
+ case KVM_CAP_USER_MEMORY:
+ case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+ case KVM_CAP_ONE_REG:
+ case KVM_CAP_ARM_PSCI:
+ case KVM_CAP_ARM_PSCI_0_2:
+ case KVM_CAP_READONLY_MEM:
+ case KVM_CAP_MP_STATE:
+ case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
+ case KVM_CAP_ARM_NISV_TO_USER:
+ case KVM_CAP_ARM_INJECT_EXT_DABT:
+ r = 1;
+ break;
+ case KVM_CAP_ARM_SET_DEVICE_ADDR:
+ r = 1;
+ break;
+ case KVM_CAP_NR_VCPUS:
+ r = num_online_cpus();
+ break;
+ case KVM_CAP_MAX_VCPUS:
+ case KVM_CAP_MAX_VCPU_ID:
+ if (kvm)
+ r = kvm->arch.max_vcpus;
+ else
+ r = kvm_arm_default_max_vcpus();
+ break;
+ case KVM_CAP_MSI_DEVID:
+ if (!kvm)
+ r = -EINVAL;
+ else
+ r = kvm->arch.vgic.msis_require_devid;
+ break;
+ case KVM_CAP_ARM_USER_IRQ:
+ /*
+ * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
+ * (bump this number if adding more devices)
+ */
+ r = 1;
+ break;
+ default:
+ r = kvm_arch_vm_ioctl_check_extension(kvm, ext);
+ break;
+ }
+ return r;
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ return -EINVAL;
+}
+
+struct kvm *kvm_arch_alloc_vm(void)
+{
+ if (!has_vhe())
+ return kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+ return vzalloc(sizeof(struct kvm));
+}
+
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+ if (!has_vhe())
+ kfree(kvm);
+ else
+ vfree(kvm);
+}
+
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
+{
+ if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
+ return -EBUSY;
+
+ if (id >= kvm->arch.max_vcpus)
+ return -EINVAL;
+
+ return 0;
+}
+
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ int err;
+
+ /* Force users to call KVM_ARM_VCPU_INIT */
+ vcpu->arch.target = -1;
+ bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+
+ /* Set up the timer */
+ kvm_timer_vcpu_init(vcpu);
+
+ kvm_pmu_vcpu_init(vcpu);
+
+ kvm_arm_reset_debug_ptr(vcpu);
+
+ kvm_arm_pvtime_vcpu_init(&vcpu->arch);
+
+ err = kvm_vgic_vcpu_init(vcpu);
+ if (err)
+ return err;
+
+ return create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
+ static_branch_dec(&userspace_irqchip_in_use);
+
+ kvm_mmu_free_memory_caches(vcpu);
+ kvm_timer_vcpu_terminate(vcpu);
+ kvm_pmu_vcpu_destroy(vcpu);
+
+ kvm_arm_vcpu_destroy(vcpu);
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return kvm_timer_is_pending(vcpu);
+}
+
+void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If we're about to block (most likely because we've just hit a
+ * WFI), we need to sync back the state of the GIC CPU interface
+ * so that we have the latest PMR and group enables. This ensures
+ * that kvm_arch_vcpu_runnable has up-to-date data to decide
+ * whether we have pending interrupts.
+ *
+ * For the same reason, we want to tell GICv4 that we need
+ * doorbells to be signalled, should an interrupt become pending.
+ */
+ preempt_disable();
+ kvm_vgic_vmcr_sync(vcpu);
+ vgic_v4_put(vcpu, true);
+ preempt_enable();
+}
+
+void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ vgic_v4_load(vcpu);
+ preempt_enable();
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ int *last_ran;
+ kvm_host_data_t *cpu_data;
+
+ last_ran = this_cpu_ptr(vcpu->kvm->arch.last_vcpu_ran);
+ cpu_data = this_cpu_ptr(&kvm_host_data);
+
+ /*
+ * We might get preempted before the vCPU actually runs, but
+ * over-invalidation doesn't affect correctness.
+ */
+ if (*last_ran != vcpu->vcpu_id) {
+ kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
+ *last_ran = vcpu->vcpu_id;
+ }
+
+ vcpu->cpu = cpu;
+ vcpu->arch.host_cpu_context = &cpu_data->host_ctxt;
+
+ kvm_vgic_load(vcpu);
+ kvm_timer_vcpu_load(vcpu);
+ kvm_vcpu_load_sysregs(vcpu);
+ kvm_arch_vcpu_load_fp(vcpu);
+ kvm_vcpu_pmu_restore_guest(vcpu);
+ if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
+ kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
+
+ if (single_task_running())
+ vcpu_clear_wfx_traps(vcpu);
+ else
+ vcpu_set_wfx_traps(vcpu);
+
+ vcpu_ptrauth_setup_lazy(vcpu);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ kvm_arch_vcpu_put_fp(vcpu);
+ kvm_vcpu_put_sysregs(vcpu);
+ kvm_timer_vcpu_put(vcpu);
+ kvm_vgic_put(vcpu);
+ kvm_vcpu_pmu_restore_host(vcpu);
+
+ vcpu->cpu = -1;
+}
+
+static void vcpu_power_off(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.power_off = true;
+ kvm_make_request(KVM_REQ_SLEEP, vcpu);
+ kvm_vcpu_kick(vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ if (vcpu->arch.power_off)
+ mp_state->mp_state = KVM_MP_STATE_STOPPED;
+ else
+ mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ int ret = 0;
+
+ switch (mp_state->mp_state) {
+ case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.power_off = false;
+ break;
+ case KVM_MP_STATE_STOPPED:
+ vcpu_power_off(vcpu);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+/**
+ * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
+ * @v: The VCPU pointer
+ *
+ * If the guest CPU is not waiting for interrupts or an interrupt line is
+ * asserted, the CPU is by definition runnable.
+ */
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
+{
+ bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
+ return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
+ && !v->arch.power_off && !v->arch.pause);
+}
+
+bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu_mode_priv(vcpu);
+}
+
+/* Just ensure a guest exit from a particular CPU */
+static void exit_vm_noop(void *info)
+{
+}
+
+void force_vm_exit(const cpumask_t *mask)
+{
+ preempt_disable();
+ smp_call_function_many(mask, exit_vm_noop, NULL, true);
+ preempt_enable();
+}
+
+/**
+ * need_new_vmid_gen - check that the VMID is still valid
+ * @vmid: The VMID to check
+ *
+ * return true if there is a new generation of VMIDs being used
+ *
+ * The hardware supports a limited set of values with the value zero reserved
+ * for the host, so we check if an assigned value belongs to a previous
+ * generation, which requires us to assign a new value. If we're the first to
+ * use a VMID for the new generation, we must flush necessary caches and TLBs
+ * on all CPUs.
+ */
+static bool need_new_vmid_gen(struct kvm_vmid *vmid)
+{
+ u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
+ smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
+ return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
+}
+
+/**
+ * update_vmid - Update the vmid with a valid VMID for the current generation
+ * @vmid: The stage-2 VMID information struct
+ */
+static void update_vmid(struct kvm_vmid *vmid)
+{
+ if (!need_new_vmid_gen(vmid))
+ return;
+
+ spin_lock(&kvm_vmid_lock);
+
+ /*
+ * We need to re-check the vmid_gen here to ensure that if another vcpu
+ * already allocated a valid vmid for this vm, then this vcpu should
+ * use the same vmid.
+ */
+ if (!need_new_vmid_gen(vmid)) {
+ spin_unlock(&kvm_vmid_lock);
+ return;
+ }
+
+ /* First user of a new VMID generation? */
+ if (unlikely(kvm_next_vmid == 0)) {
+ atomic64_inc(&kvm_vmid_gen);
+ kvm_next_vmid = 1;
+
+ /*
+ * On SMP we know no other CPUs can use this CPU's or each
+ * other's VMID after force_vm_exit returns since the
+ * kvm_vmid_lock blocks them from reentry to the guest.
+ */
+ force_vm_exit(cpu_all_mask);
+ /*
+ * Now broadcast TLB + ICACHE invalidation over the inner
+ * shareable domain to make sure all data structures are
+ * clean.
+ */
+ kvm_call_hyp(__kvm_flush_vm_context);
+ }
+
+ vmid->vmid = kvm_next_vmid;
+ kvm_next_vmid++;
+ kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
+
+ smp_wmb();
+ WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
+
+ spin_unlock(&kvm_vmid_lock);
+}
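
The wrap-around handled above is easiest to see in isolation; a stand-alone sketch (illustration only, assuming 8-bit VMIDs; the real width comes from kvm_get_vmid_bits()):

    #include <stdio.h>

    int main(void)
    {
        unsigned int vmid_bits = 8;                 /* assumed width, for illustration */
        unsigned int next_vmid = 255;

        unsigned int vmid = next_vmid++;            /* hand out the last VMID...       */
        next_vmid &= (1u << vmid_bits) - 1;         /* ...and the mask wraps to 0      */

        /* The next caller sees next_vmid == 0, bumps the generation and flushes. */
        printf("vmid=%u next=%u\n", vmid, next_vmid);
        return 0;
    }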
+
+static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int ret = 0;
+
+ if (likely(vcpu->arch.has_run_once))
+ return 0;
+
+ if (!kvm_arm_vcpu_is_finalized(vcpu))
+ return -EPERM;
+
+ vcpu->arch.has_run_once = true;
+
+ if (likely(irqchip_in_kernel(kvm))) {
+ /*
+ * Map the VGIC hardware resources before running a vcpu the
+ * first time on this VM.
+ */
+ if (unlikely(!vgic_ready(kvm))) {
+ ret = kvm_vgic_map_resources(kvm);
+ if (ret)
+ return ret;
+ }
+ } else {
+ /*
+ * Tell the rest of the code that there are userspace irqchip
+ * VMs in the wild.
+ */
+ static_branch_inc(&userspace_irqchip_in_use);
+ }
+
+ ret = kvm_timer_enable(vcpu);
+ if (ret)
+ return ret;
+
+ ret = kvm_arm_pmu_v3_enable(vcpu);
+
+ return ret;
+}
+
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+ return vgic_initialized(kvm);
+}
+
+void kvm_arm_halt_guest(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ vcpu->arch.pause = true;
+ kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
+}
+
+void kvm_arm_resume_guest(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ vcpu->arch.pause = false;
+ rcuwait_wake_up(kvm_arch_vcpu_get_wait(vcpu));
+ }
+}
+
+static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
+{
+ struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
+
+ rcuwait_wait_event(wait,
+ (!vcpu->arch.power_off) && (!vcpu->arch.pause),
+ TASK_INTERRUPTIBLE);
+
+ if (vcpu->arch.power_off || vcpu->arch.pause) {
+ /* Awaken to handle a signal, request we sleep again later. */
+ kvm_make_request(KVM_REQ_SLEEP, vcpu);
+ }
+
+ /*
+ * Make sure we will observe a potential reset request if we've
+ * observed a change to the power state. Pairs with the smp_wmb() in
+ * kvm_psci_vcpu_on().
+ */
+ smp_rmb();
+}
+
+static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.target >= 0;
+}
+
+static void check_vcpu_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
+ vcpu_req_sleep(vcpu);
+
+ if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
+ kvm_reset_vcpu(vcpu);
+
+ /*
+ * Clear IRQ_PENDING requests that were made to guarantee
+ * that a VCPU sees new virtual interrupts.
+ */
+ kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
+
+ if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
+ kvm_update_stolen_time(vcpu);
+
+ if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
+ /* The distributor enable bits were changed */
+ preempt_disable();
+ vgic_v4_put(vcpu, false);
+ vgic_v4_load(vcpu);
+ preempt_enable();
+ }
+ }
+}
+
+/**
+ * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
+ * @vcpu: The VCPU pointer
+ *
+ * This function is called through the KVM_RUN ioctl from user space. It
+ * executes guest code in a loop until the time slice for the process is used
+ * up or some emulation is needed from user space, in which case the function
+ * returns 0 with the kvm_run structure filled in with the data required for
+ * the requested emulation.
+ */
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+ int ret;
+
+ if (unlikely(!kvm_vcpu_initialized(vcpu)))
+ return -ENOEXEC;
+
+ ret = kvm_vcpu_first_run_init(vcpu);
+ if (ret)
+ return ret;
+
+ if (run->exit_reason == KVM_EXIT_MMIO) {
+ ret = kvm_handle_mmio_return(vcpu, run);
+ if (ret)
+ return ret;
+ }
+
+ if (run->immediate_exit)
+ return -EINTR;
+
+ vcpu_load(vcpu);
+
+ kvm_sigset_activate(vcpu);
+
+ ret = 1;
+ run->exit_reason = KVM_EXIT_UNKNOWN;
+ while (ret > 0) {
+ /*
+ * Check conditions before entering the guest
+ */
+ cond_resched();
+
+ update_vmid(&vcpu->kvm->arch.vmid);
+
+ check_vcpu_requests(vcpu);
+
+ /*
+ * Preparing the interrupts to be injected also
+ * involves poking the GIC, which must be done in a
+ * non-preemptible context.
+ */
+ preempt_disable();
+
+ kvm_pmu_flush_hwstate(vcpu);
+
+ local_irq_disable();
+
+ kvm_vgic_flush_hwstate(vcpu);
+
+ /*
+ * Exit if we have a signal pending so that we can deliver the
+ * signal to user space.
+ */
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ run->exit_reason = KVM_EXIT_INTR;
+ }
+
+ /*
+ * If we're using a userspace irqchip, then check if we need
+ * to tell a userspace irqchip about timer or PMU level
+ * changes and if so, exit to userspace (the actual level
+ * state gets updated in kvm_timer_update_run and
+ * kvm_pmu_update_run below).
+ */
+ if (static_branch_unlikely(&userspace_irqchip_in_use)) {
+ if (kvm_timer_should_notify_user(vcpu) ||
+ kvm_pmu_should_notify_user(vcpu)) {
+ ret = -EINTR;
+ run->exit_reason = KVM_EXIT_INTR;
+ }
+ }
+
+ /*
+ * Ensure we set mode to IN_GUEST_MODE after we disable
+ * interrupts and before the final VCPU requests check.
+ * See the comment in kvm_vcpu_exiting_guest_mode() and
+ * Documentation/virt/kvm/vcpu-requests.rst
+ */
+ smp_store_mb(vcpu->mode, IN_GUEST_MODE);
+
+ if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) ||
+ kvm_request_pending(vcpu)) {
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ isb(); /* Ensure work in x_flush_hwstate is committed */
+ kvm_pmu_sync_hwstate(vcpu);
+ if (static_branch_unlikely(&userspace_irqchip_in_use))
+ kvm_timer_sync_hwstate(vcpu);
+ kvm_vgic_sync_hwstate(vcpu);
+ local_irq_enable();
+ preempt_enable();
+ continue;
+ }
+
+ kvm_arm_setup_debug(vcpu);
+
+ /**************************************************************
+ * Enter the guest
+ */
+ trace_kvm_entry(*vcpu_pc(vcpu));
+ guest_enter_irqoff();
+
+ if (has_vhe()) {
+ ret = kvm_vcpu_run_vhe(vcpu);
+ } else {
+ ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
+ }
+
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ vcpu->stat.exits++;
+ /*
+ * Back from guest
+ *************************************************************/
+
+ kvm_arm_clear_debug(vcpu);
+
+ /*
+ * We must sync the PMU state before the vgic state so
+ * that the vgic can properly sample the updated state of the
+ * interrupt line.
+ */
+ kvm_pmu_sync_hwstate(vcpu);
+
+ /*
+ * Sync the vgic state before syncing the timer state because
+ * the timer code needs to know if the virtual timer
+ * interrupts are active.
+ */
+ kvm_vgic_sync_hwstate(vcpu);
+
+ /*
+ * Sync the timer hardware state before enabling interrupts as
+ * we don't want vtimer interrupts to race with syncing the
+ * timer virtual interrupt state.
+ */
+ if (static_branch_unlikely(&userspace_irqchip_in_use))
+ kvm_timer_sync_hwstate(vcpu);
+
+ kvm_arch_vcpu_ctxsync_fp(vcpu);
+
+ /*
+ * We may have taken a host interrupt in HYP mode (i.e.
+ * while executing the guest). This interrupt is still
+ * pending, as we haven't serviced it yet!
+ *
+ * We're now back in SVC mode, with interrupts
+ * disabled. Enabling the interrupts now will have
+ * the effect of taking the interrupt again, in SVC
+ * mode this time.
+ */
+ local_irq_enable();
+
+ /*
+ * We do local_irq_enable() before calling guest_exit() so
+ * that if a timer interrupt hits while running the guest we
+ * account that tick as being spent in the guest. We enable
+ * preemption after calling guest_exit() so that if we get
+ * preempted we make sure ticks after that are not counted as
+ * guest time.
+ */
+ guest_exit();
+ trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
+
+ /* Exit types that need handling before we can be preempted */
+ handle_exit_early(vcpu, run, ret);
+
+ preempt_enable();
+
+ ret = handle_exit(vcpu, run, ret);
+ }
+
+ /* Tell userspace about in-kernel device output levels */
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
+ kvm_timer_update_run(vcpu);
+ kvm_pmu_update_run(vcpu);
+ }
+
+ kvm_sigset_deactivate(vcpu);
+
+ vcpu_put(vcpu);
+ return ret;
+}
+
+static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
+{
+ int bit_index;
+ bool set;
+ unsigned long *hcr;
+
+ if (number == KVM_ARM_IRQ_CPU_IRQ)
+ bit_index = __ffs(HCR_VI);
+ else /* KVM_ARM_IRQ_CPU_FIQ */
+ bit_index = __ffs(HCR_VF);
+
+ hcr = vcpu_hcr(vcpu);
+ if (level)
+ set = test_and_set_bit(bit_index, hcr);
+ else
+ set = test_and_clear_bit(bit_index, hcr);
+
+ /*
+ * If we didn't change anything, no need to wake up or kick other CPUs
+ */
+ if (set == level)
+ return 0;
+
+ /*
+ * The vcpu irq_lines field was updated, wake up sleeping VCPUs and
+ * trigger a world-switch round on the running physical CPU to set the
+ * virtual IRQ/FIQ fields in the HCR appropriately.
+ */
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return 0;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
+ bool line_status)
+{
+ u32 irq = irq_level->irq;
+ unsigned int irq_type, vcpu_idx, irq_num;
+ int nrcpus = atomic_read(&kvm->online_vcpus);
+ struct kvm_vcpu *vcpu = NULL;
+ bool level = irq_level->level;
+
+ irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
+ vcpu_idx = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
+ vcpu_idx += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1);
+ irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;
+
+ trace_kvm_irq_line(irq_type, vcpu_idx, irq_num, irq_level->level);
+
+ switch (irq_type) {
+ case KVM_ARM_IRQ_TYPE_CPU:
+ if (irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ if (vcpu_idx >= nrcpus)
+ return -EINVAL;
+
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ if (!vcpu)
+ return -EINVAL;
+
+ if (irq_num > KVM_ARM_IRQ_CPU_FIQ)
+ return -EINVAL;
+
+ return vcpu_interrupt_line(vcpu, irq_num, level);
+ case KVM_ARM_IRQ_TYPE_PPI:
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ if (vcpu_idx >= nrcpus)
+ return -EINVAL;
+
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ if (!vcpu)
+ return -EINVAL;
+
+ if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
+ return -EINVAL;
+
+ return kvm_vgic_inject_irq(kvm, vcpu->vcpu_id, irq_num, level, NULL);
+ case KVM_ARM_IRQ_TYPE_SPI:
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ if (irq_num < VGIC_NR_PRIVATE_IRQS)
+ return -EINVAL;
+
+ return kvm_vgic_inject_irq(kvm, 0, irq_num, level, NULL);
+ }
+
+ return -EINVAL;
+}
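
For reference, the decoding above can be mirrored when building the irq field on the userspace side; a sketch (arm_irq_field() is a hypothetical helper, only the KVM_ARM_IRQ_* shift/mask names from the UAPI header are relied on):

    #include <linux/kvm.h>

    static __u32 arm_irq_field(__u32 irq_type, __u32 vcpu_idx, __u32 irq_num)
    {
        __u32 vcpu_lo = vcpu_idx & KVM_ARM_IRQ_VCPU_MASK;
        __u32 vcpu_hi = vcpu_idx / (KVM_ARM_IRQ_VCPU_MASK + 1);

        return (irq_type << KVM_ARM_IRQ_TYPE_SHIFT) |
               (vcpu_hi << KVM_ARM_IRQ_VCPU2_SHIFT) |
               (vcpu_lo << KVM_ARM_IRQ_VCPU_SHIFT) |
               (irq_num << KVM_ARM_IRQ_NUM_SHIFT);
    }

The result goes into kvm_irq_level.irq together with the desired level for the KVM_IRQ_LINE ioctl on the VM fd.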
+
+static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+ const struct kvm_vcpu_init *init)
+{
+ unsigned int i, ret;
+ int phys_target = kvm_target_cpu();
+
+ if (init->target != phys_target)
+ return -EINVAL;
+
+ /*
+ * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+ * use the same target.
+ */
+ if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
+ return -EINVAL;
+
+ /* -ENOENT for unknown features, -EINVAL for invalid combinations. */
+ for (i = 0; i < sizeof(init->features) * 8; i++) {
+ bool set = (init->features[i / 32] & (1 << (i % 32)));
+
+ if (set && i >= KVM_VCPU_MAX_FEATURES)
+ return -ENOENT;
+
+ /*
+ * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+ * use the same feature set.
+ */
+ if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
+ test_bit(i, vcpu->arch.features) != set)
+ return -EINVAL;
+
+ if (set)
+ set_bit(i, vcpu->arch.features);
+ }
+
+ vcpu->arch.target = phys_target;
+
+ /* Now we know what it is, we can reset it. */
+ ret = kvm_reset_vcpu(vcpu);
+ if (ret) {
+ vcpu->arch.target = -1;
+ bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+ }
+
+ return ret;
+}
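
The feature bitmap decoded above is laid out in 32-bit words; a userspace setter that mirrors that layout exactly (sketch, hypothetical helper name):

    #include <linux/kvm.h>

    /* Mirror of the "features[i / 32] & (1 << (i % 32))" test above. */
    static void vcpu_init_set_feature(struct kvm_vcpu_init *init, unsigned int feature)
    {
        init->features[feature / 32] |= 1u << (feature % 32);
    }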
+
+static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_init *init)
+{
+ int ret;
+
+ ret = kvm_vcpu_set_target(vcpu, init);
+ if (ret)
+ return ret;
+
+ /*
+ * Ensure a rebooted VM will fault in RAM pages and detect if the
+ * guest MMU is turned off and flush the caches as needed.
+ *
+ * S2FWB enforces all memory accesses to RAM being cacheable, so we
+ * can rely on the cache always being coherent.
+ */
+ if (vcpu->arch.has_run_once && !cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+ stage2_unmap_vm(vcpu->kvm);
+
+ vcpu_reset_hcr(vcpu);
+
+ /*
+ * Handle the "start in power-off" case.
+ */
+ if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
+ vcpu_power_off(vcpu);
+ else
+ vcpu->arch.power_off = false;
+
+ return 0;
+}
+
+static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int ret = -ENXIO;
+
+ switch (attr->group) {
+ default:
+ ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
+ break;
+ }
+
+ return ret;
+}
+
+static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int ret = -ENXIO;
+
+ switch (attr->group) {
+ default:
+ ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
+ break;
+ }
+
+ return ret;
+}
+
+static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int ret = -ENXIO;
+
+ switch (attr->group) {
+ default:
+ ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
+ break;
+ }
+
+ return ret;
+}
+
+static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ memset(events, 0, sizeof(*events));
+
+ return __kvm_arm_vcpu_get_events(vcpu, events);
+}
+
+static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_events *events)
+{
+ int i;
+
+ /* check whether the reserved field is zero */
+ for (i = 0; i < ARRAY_SIZE(events->reserved); i++)
+ if (events->reserved[i])
+ return -EINVAL;
+
+ /* check whether the pad field is zero */
+ for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++)
+ if (events->exception.pad[i])
+ return -EINVAL;
+
+ return __kvm_arm_vcpu_set_events(vcpu, events);
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+ struct kvm_device_attr attr;
+ long r;
+
+ switch (ioctl) {
+ case KVM_ARM_VCPU_INIT: {
+ struct kvm_vcpu_init init;
+
+ r = -EFAULT;
+ if (copy_from_user(&init, argp, sizeof(init)))
+ break;
+
+ r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
+ break;
+ }
+ case KVM_SET_ONE_REG:
+ case KVM_GET_ONE_REG: {
+ struct kvm_one_reg reg;
+
+ r = -ENOEXEC;
+ if (unlikely(!kvm_vcpu_initialized(vcpu)))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&reg, argp, sizeof(reg)))
+ break;
+
+ if (ioctl == KVM_SET_ONE_REG)
+ r = kvm_arm_set_reg(vcpu, &reg);
+ else
+ r = kvm_arm_get_reg(vcpu, &reg);
+ break;
+ }
+ case KVM_GET_REG_LIST: {
+ struct kvm_reg_list __user *user_list = argp;
+ struct kvm_reg_list reg_list;
+ unsigned n;
+
+ r = -ENOEXEC;
+ if (unlikely(!kvm_vcpu_initialized(vcpu)))
+ break;
+
+ r = -EPERM;
+ if (!kvm_arm_vcpu_is_finalized(vcpu))
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
+ break;
+ n = reg_list.n;
+ reg_list.n = kvm_arm_num_regs(vcpu);
+ if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
+ break;
+ r = -E2BIG;
+ if (n < reg_list.n)
+ break;
+ r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
+ break;
+ }
+ case KVM_SET_DEVICE_ATTR: {
+ r = -EFAULT;
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ break;
+ r = kvm_arm_vcpu_set_attr(vcpu, &attr);
+ break;
+ }
+ case KVM_GET_DEVICE_ATTR: {
+ r = -EFAULT;
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ break;
+ r = kvm_arm_vcpu_get_attr(vcpu, &attr);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR: {
+ r = -EFAULT;
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ break;
+ r = kvm_arm_vcpu_has_attr(vcpu, &attr);
+ break;
+ }
+ case KVM_GET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ if (kvm_arm_vcpu_get_events(vcpu, &events))
+ return -EINVAL;
+
+ if (copy_to_user(argp, &events, sizeof(events)))
+ return -EFAULT;
+
+ return 0;
+ }
+ case KVM_SET_VCPU_EVENTS: {
+ struct kvm_vcpu_events events;
+
+ if (copy_from_user(&events, argp, sizeof(events)))
+ return -EFAULT;
+
+ return kvm_arm_vcpu_set_events(vcpu, &events);
+ }
+ case KVM_ARM_VCPU_FINALIZE: {
+ int what;
+
+ if (!kvm_vcpu_initialized(vcpu))
+ return -ENOEXEC;
+
+ if (get_user(what, (const int __user *)argp))
+ return -EFAULT;
+
+ return kvm_arm_vcpu_finalize(vcpu, what);
+ }
+ default:
+ r = -EINVAL;
+ }
+
+ return r;
+}
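
The KVM_GET_REG_LIST case above implements the usual two-call sizing protocol; on the VMM side it is typically consumed along these lines (sketch; get_reg_list() is a hypothetical helper, error handling trimmed):

    #include <linux/kvm.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    static struct kvm_reg_list *get_reg_list(int vcpu_fd)
    {
        struct kvm_reg_list probe = { .n = 0 };
        struct kvm_reg_list *list;

        /* First call: n is too small, the kernel reports the real count and fails with E2BIG. */
        ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe);

        list = malloc(sizeof(*list) + probe.n * sizeof(__u64));
        if (!list)
            return NULL;
        list->n = probe.n;

        /* Second call: the buffer is large enough, the register indices are copied out. */
        if (ioctl(vcpu_fd, KVM_GET_REG_LIST, list) < 0) {
            free(list);
            return NULL;
        }
        return list;
    }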
+
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+}
+
+void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
+ struct kvm_arm_device_addr *dev_addr)
+{
+ unsigned long dev_id, type;
+
+ dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >>
+ KVM_ARM_DEVICE_ID_SHIFT;
+ type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >>
+ KVM_ARM_DEVICE_TYPE_SHIFT;
+
+ switch (dev_id) {
+ case KVM_ARM_DEVICE_VGIC_V2:
+ if (!vgic_present)
+ return -ENXIO;
+ return kvm_vgic_addr(kvm, type, &dev_addr->addr, true);
+ default:
+ return -ENODEV;
+ }
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+ unsigned int ioctl, unsigned long arg)
+{
+ struct kvm *kvm = filp->private_data;
+ void __user *argp = (void __user *)arg;
+
+ switch (ioctl) {
+ case KVM_CREATE_IRQCHIP: {
+ int ret;
+ if (!vgic_present)
+ return -ENXIO;
+ mutex_lock(&kvm->lock);
+ ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+ mutex_unlock(&kvm->lock);
+ return ret;
+ }
+ case KVM_ARM_SET_DEVICE_ADDR: {
+ struct kvm_arm_device_addr dev_addr;
+
+ if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
+ return -EFAULT;
+ return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
+ }
+ case KVM_ARM_PREFERRED_TARGET: {
+ int err;
+ struct kvm_vcpu_init init;
+
+ err = kvm_vcpu_preferred_target(&init);
+ if (err)
+ return err;
+
+ if (copy_to_user(argp, &init, sizeof(init)))
+ return -EFAULT;
+
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+}
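
Combined with the per-vCPU KVM_ARM_VCPU_INIT path earlier in this file, the KVM_ARM_PREFERRED_TARGET ioctl above is normally used as follows (sketch; vm_fd/vcpu_fd assumed to exist, error handling trimmed):

    #include <linux/kvm.h>
    #include <sys/ioctl.h>

    static int init_one_vcpu(int vm_fd, int vcpu_fd)
    {
        struct kvm_vcpu_init init;

        /* Ask the VM which target this host prefers... */
        if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init) < 0)
            return -1;

        /* ...optionally request features, e.g. start the vCPU powered off... */
        init.features[KVM_ARM_VCPU_POWER_OFF / 32] |= 1u << (KVM_ARM_VCPU_POWER_OFF % 32);

        /* ...and initialize the vCPU with that target and feature set. */
        return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
    }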
+
+static void cpu_init_hyp_mode(void)
+{
+ phys_addr_t pgd_ptr;
+ unsigned long hyp_stack_ptr;
+ unsigned long vector_ptr;
+ unsigned long tpidr_el2;
+
+ /* Switch from the HYP stub to our own HYP init vector */
+ __hyp_set_vectors(kvm_get_idmap_vector());
+
+ /*
+ * Calculate the raw per-cpu offset without a translation from the
+ * kernel's mapping to the linear mapping, and store it in tpidr_el2
+ * so that we can use adr_l to access per-cpu variables in EL2.
+ */
+ tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
+ (unsigned long)kvm_ksym_ref(kvm_host_data));
+
+ pgd_ptr = kvm_mmu_get_httbr();
+ hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
+ vector_ptr = (unsigned long)kvm_get_hyp_vector();
+
+ /*
+ * Call initialization code, and switch to the full-blown HYP code.
+ * If the cpucaps haven't been finalized yet, something has gone very
+ * wrong, and hyp will crash and burn when it uses any
+ * cpus_have_const_cap() wrapper.
+ */
+ BUG_ON(!system_capabilities_finalized());
+ __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+
+ /*
+ * Disabling SSBD on a non-VHE system requires us to enable SSBS
+ * at EL2.
+ */
+ if (this_cpu_has_cap(ARM64_SSBS) &&
+ arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
+ kvm_call_hyp(__kvm_enable_ssbs);
+ }
+}
+
+static void cpu_hyp_reset(void)
+{
+ if (!is_kernel_in_hyp_mode())
+ __hyp_reset_vectors();
+}
+
+static void cpu_hyp_reinit(void)
+{
+ kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
+
+ cpu_hyp_reset();
+
+ if (is_kernel_in_hyp_mode())
+ kvm_timer_init_vhe();
+ else
+ cpu_init_hyp_mode();
+
+ kvm_arm_init_debug();
+
+ if (vgic_present)
+ kvm_vgic_init_cpu_hardware();
+}
+
+static void _kvm_arch_hardware_enable(void *discard)
+{
+ if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
+ cpu_hyp_reinit();
+ __this_cpu_write(kvm_arm_hardware_enabled, 1);
+ }
+}
+
+int kvm_arch_hardware_enable(void)
+{
+ _kvm_arch_hardware_enable(NULL);
+ return 0;
+}
+
+static void _kvm_arch_hardware_disable(void *discard)
+{
+ if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+ cpu_hyp_reset();
+ __this_cpu_write(kvm_arm_hardware_enabled, 0);
+ }
+}
+
+void kvm_arch_hardware_disable(void)
+{
+ _kvm_arch_hardware_disable(NULL);
+}
+
+#ifdef CONFIG_CPU_PM
+static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
+ unsigned long cmd,
+ void *v)
+{
+ /*
+ * kvm_arm_hardware_enabled is left with its old value over
+ * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
+ * re-enable hyp.
+ */
+ switch (cmd) {
+ case CPU_PM_ENTER:
+ if (__this_cpu_read(kvm_arm_hardware_enabled))
+ /*
+ * don't update kvm_arm_hardware_enabled here
+ * so that the hardware will be re-enabled
+ * when we resume. See below.
+ */
+ cpu_hyp_reset();
+
+ return NOTIFY_OK;
+ case CPU_PM_ENTER_FAILED:
+ case CPU_PM_EXIT:
+ if (__this_cpu_read(kvm_arm_hardware_enabled))
+ /* The hardware was enabled before suspend. */
+ cpu_hyp_reinit();
+
+ return NOTIFY_OK;
+
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block hyp_init_cpu_pm_nb = {
+ .notifier_call = hyp_init_cpu_pm_notifier,
+};
+
+static void __init hyp_cpu_pm_init(void)
+{
+ cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
+}
+static void __init hyp_cpu_pm_exit(void)
+{
+ cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
+}
+#else
+static inline void hyp_cpu_pm_init(void)
+{
+}
+static inline void hyp_cpu_pm_exit(void)
+{
+}
+#endif
+
+static int init_common_resources(void)
+{
+ return kvm_set_ipa_limit();
+}
+
+static int init_subsystems(void)
+{
+ int err = 0;
+
+ /*
+ * Enable hardware so that subsystem initialisation can access EL2.
+ */
+ on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
+
+ /*
+ * Register the CPU low-power notifier
+ */
+ hyp_cpu_pm_init();
+
+ /*
+ * Init HYP view of VGIC
+ */
+ err = kvm_vgic_hyp_init();
+ switch (err) {
+ case 0:
+ vgic_present = true;
+ break;
+ case -ENODEV:
+ case -ENXIO:
+ vgic_present = false;
+ err = 0;
+ break;
+ default:
+ goto out;
+ }
+
+ /*
+ * Init HYP architected timer support
+ */
+ err = kvm_timer_hyp_init(vgic_present);
+ if (err)
+ goto out;
+
+ kvm_perf_init();
+ kvm_coproc_table_init();
+
+out:
+ on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+
+ return err;
+}
+
+static void teardown_hyp_mode(void)
+{
+ int cpu;
+
+ free_hyp_pgds();
+ for_each_possible_cpu(cpu)
+ free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+}
+
+/**
+ * Initializes Hyp-mode on all online CPUs
+ */
+static int init_hyp_mode(void)
+{
+ int cpu;
+ int err = 0;
+
+ /*
+ * Allocate Hyp PGD and setup Hyp identity mapping
+ */
+ err = kvm_mmu_init();
+ if (err)
+ goto out_err;
+
+ /*
+ * Allocate stack pages for Hypervisor-mode
+ */
+ for_each_possible_cpu(cpu) {
+ unsigned long stack_page;
+
+ stack_page = __get_free_page(GFP_KERNEL);
+ if (!stack_page) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
+ }
+
+ /*
+ * Map the Hyp-code called directly from the host
+ */
+ err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
+ kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
+ if (err) {
+ kvm_err("Cannot map world-switch code\n");
+ goto out_err;
+ }
+
+ err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
+ kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
+ if (err) {
+ kvm_err("Cannot map rodata section\n");
+ goto out_err;
+ }
+
+ err = create_hyp_mappings(kvm_ksym_ref(__bss_start),
+ kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
+ if (err) {
+ kvm_err("Cannot map bss section\n");
+ goto out_err;
+ }
+
+ err = kvm_map_vectors();
+ if (err) {
+ kvm_err("Cannot map vectors\n");
+ goto out_err;
+ }
+
+ /*
+ * Map the Hyp stack pages
+ */
+ for_each_possible_cpu(cpu) {
+ char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
+ err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+ PAGE_HYP);
+
+ if (err) {
+ kvm_err("Cannot map hyp stack\n");
+ goto out_err;
+ }
+ }
+
+ for_each_possible_cpu(cpu) {
+ kvm_host_data_t *cpu_data;
+
+ cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
+ err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
+
+ if (err) {
+ kvm_err("Cannot map host CPU state: %d\n", err);
+ goto out_err;
+ }
+ }
+
+ err = hyp_map_aux_data();
+ if (err)
+ kvm_err("Cannot map host auxiliary data: %d\n", err);
+
+ return 0;
+
+out_err:
+ teardown_hyp_mode();
+ kvm_err("error initializing Hyp mode: %d\n", err);
+ return err;
+}
+
+static void check_kvm_target_cpu(void *ret)
+{
+ *(int *)ret = kvm_target_cpu();
+}
+
+struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ mpidr &= MPIDR_HWID_BITMASK;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu))
+ return vcpu;
+ }
+ return NULL;
+}
+
+bool kvm_arch_has_irq_bypass(void)
+{
+ return true;
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
+ &irqfd->irq_entry);
+}
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
+ &irqfd->irq_entry);
+}
+
+void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ kvm_arm_halt_guest(irqfd->kvm);
+}
+
+void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ kvm_arm_resume_guest(irqfd->kvm);
+}
+
+/**
+ * Initialize Hyp-mode and memory mappings on all CPUs.
+ */
+int kvm_arch_init(void *opaque)
+{
+ int err;
+ int ret, cpu;
+ bool in_hyp_mode;
+
+ if (!is_hyp_mode_available()) {
+ kvm_info("HYP mode not available\n");
+ return -ENODEV;
+ }
+
+ in_hyp_mode = is_kernel_in_hyp_mode();
+
+ if (!in_hyp_mode && kvm_arch_requires_vhe()) {
+ kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
+ return -ENODEV;
+ }
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
+ if (ret < 0) {
+ kvm_err("Error, CPU %d not supported!\n", cpu);
+ return -ENODEV;
+ }
+ }
+
+ err = init_common_resources();
+ if (err)
+ return err;
+
+ err = kvm_arm_init_sve();
+ if (err)
+ return err;
+
+ if (!in_hyp_mode) {
+ err = init_hyp_mode();
+ if (err)
+ goto out_err;
+ }
+
+ err = init_subsystems();
+ if (err)
+ goto out_hyp;
+
+ if (in_hyp_mode)
+ kvm_info("VHE mode initialized successfully\n");
+ else
+ kvm_info("Hyp mode initialized successfully\n");
+
+ return 0;
+
+out_hyp:
+ hyp_cpu_pm_exit();
+ if (!in_hyp_mode)
+ teardown_hyp_mode();
+out_err:
+ return err;
+}
+
+/* NOP: Compiling as a module not supported */
+void kvm_arch_exit(void)
+{
+ kvm_perf_teardown();
+}
+
+static int arm_init(void)
+{
+ int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ return rc;
+}
+
+module_init(arm_init);
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 23ebe51410f0..aea43ec60f37 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -29,20 +29,19 @@
#include "trace.h"
-#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM }
-#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU }
-
struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT(halt_successful_poll),
- VCPU_STAT(halt_attempted_poll),
- VCPU_STAT(halt_poll_invalid),
- VCPU_STAT(halt_wakeup),
- VCPU_STAT(hvc_exit_stat),
- VCPU_STAT(wfe_exit_stat),
- VCPU_STAT(wfi_exit_stat),
- VCPU_STAT(mmio_exit_user),
- VCPU_STAT(mmio_exit_kernel),
- VCPU_STAT(exits),
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
+ VCPU_STAT("hvc_exit_stat", hvc_exit_stat),
+ VCPU_STAT("wfe_exit_stat", wfe_exit_stat),
+ VCPU_STAT("wfi_exit_stat", wfi_exit_stat),
+ VCPU_STAT("mmio_exit_user", mmio_exit_user),
+ VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel),
+ VCPU_STAT("exits", exits),
+ VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+ VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
{ NULL }
};
@@ -200,6 +199,13 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
}
memcpy((u32 *)regs + off, valp, KVM_REG_SIZE(reg->id));
+
+ if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) {
+ int i;
+
+ for (i = 0; i < 16; i++)
+ *vcpu_reg32(vcpu, i) = (u32)*vcpu_reg32(vcpu, i);
+ }
out:
return err;
}
@@ -260,7 +266,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
/*
* Vector lengths supported by the host can't currently be
* hidden from the guest individually: instead we can only set a
- * maxmium via ZCR_EL2.LEN. So, make sure the available vector
+ * maximum via ZCR_EL2.LEN. So, make sure the available vector
* lengths match the set requested exactly up to the requested
* maximum:
*/
@@ -330,7 +336,7 @@ static int sve_reg_to_region(struct sve_state_reg_region *region,
unsigned int reg_num;
unsigned int reqoffset, reqlen; /* User-requested offset and length */
- unsigned int maxlen; /* Maxmimum permitted length */
+ unsigned int maxlen; /* Maximum permitted length */
size_t sve_state_size;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index aacfc55de44c..eb194696ef62 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -23,7 +23,7 @@
#include <kvm/arm_hypercalls.h>
#define CREATE_TRACE_POINTS
-#include "trace.h"
+#include "trace_handle_exit.h"
typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index ea710f674cb6..8c9880783839 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -6,20 +6,10 @@
ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING \
$(DISABLE_STACKLEAK_PLUGIN)
-KVM=../../../../virt/kvm
+obj-$(CONFIG_KVM) += hyp.o
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/aarch32.o
-
-obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-cpuif-proxy.o
-obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += entry.o
-obj-$(CONFIG_KVM_ARM_HOST) += switch.o
-obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
-obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
-obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
+hyp-y := vgic-v3-sr.o timer-sr.o aarch32.o vgic-v2-cpuif-proxy.o sysreg-sr.o \
+ debug-sr.o entry.o switch.o fpsimd.o tlb.o hyp-entry.o
# KVM code is run at a different exception code with a different map, so
# compiler instrumentation that inserts callbacks or checks into the code may
diff --git a/arch/arm64/kvm/hyp/aarch32.c b/arch/arm64/kvm/hyp/aarch32.c
new file mode 100644
index 000000000000..25c0e47d57cb
--- /dev/null
+++ b/arch/arm64/kvm/hyp/aarch32.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyp portion of the (not much of an) Emulation layer for 32bit guests.
+ *
+ * Copyright (C) 2012,2013 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * based on arch/arm/kvm/emulate.c
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+
+/*
+ * stolen from arch/arm/kernel/opcodes.c
+ *
+ * condition code lookup table
+ * index into the table is test code: EQ, NE, ... LT, GT, AL, NV
+ *
+ * bit position in short is condition code: NZCV
+ */
+static const unsigned short cc_map[16] = {
+ 0xF0F0, /* EQ == Z set */
+ 0x0F0F, /* NE */
+ 0xCCCC, /* CS == C set */
+ 0x3333, /* CC */
+ 0xFF00, /* MI == N set */
+ 0x00FF, /* PL */
+ 0xAAAA, /* VS == V set */
+ 0x5555, /* VC */
+ 0x0C0C, /* HI == C set && Z clear */
+ 0xF3F3, /* LS == C clear || Z set */
+ 0xAA55, /* GE == (N==V) */
+ 0x55AA, /* LT == (N!=V) */
+ 0x0A05, /* GT == (!Z && (N==V)) */
+ 0xF5FA, /* LE == (Z || (N!=V)) */
+ 0xFFFF, /* AL always */
+ 0 /* NV */
+};
+
+/*
+ * Check if a trapped instruction should have been executed or not.
+ */
+bool __hyp_text kvm_condition_valid32(const struct kvm_vcpu *vcpu)
+{
+ unsigned long cpsr;
+ u32 cpsr_cond;
+ int cond;
+
+ /* Top two bits non-zero? Unconditional. */
+ if (kvm_vcpu_get_hsr(vcpu) >> 30)
+ return true;
+
+ /* Is condition field valid? */
+ cond = kvm_vcpu_get_condition(vcpu);
+ if (cond == 0xE)
+ return true;
+
+ cpsr = *vcpu_cpsr(vcpu);
+
+ if (cond < 0) {
+ /* This can happen in Thumb mode: examine IT state. */
+ unsigned long it;
+
+ it = ((cpsr >> 8) & 0xFC) | ((cpsr >> 25) & 0x3);
+
+ /* it == 0 => unconditional. */
+ if (it == 0)
+ return true;
+
+ /* The cond for this insn works out as the top 4 bits. */
+ cond = (it >> 4);
+ }
+
+ cpsr_cond = cpsr >> 28;
+
+ if (!((cc_map[cond] >> cpsr_cond) & 1))
+ return false;
+
+ return true;
+}
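
A quick stand-alone check of the table lookup above, using the EQ row as a worked example (illustration only):

    #include <assert.h>

    int main(void)
    {
        const unsigned short eq_row = 0xF0F0;       /* cc_map[0]: EQ == Z set     */
        unsigned int nzcv_z_set = 0x4;              /* N=0 Z=1 C=0 V=0            */
        unsigned int nzcv_z_clear = 0x0;

        assert((eq_row >> nzcv_z_set) & 1);         /* EQ holds when Z is set     */
        assert(!((eq_row >> nzcv_z_clear) & 1));    /* EQ fails when Z is clear   */
        return 0;
    }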
+
+/**
+ * kvm_adjust_itstate - adjust ITSTATE when emulating instructions in an IT block
+ * @vcpu: The VCPU pointer
+ *
+ * When exceptions occur while instructions are executed in Thumb IF-THEN
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
+ * to do this little bit of work manually. The fields map like this:
+ *
+ * IT[7:0] -> CPSR[26:25],CPSR[15:10]
+ */
+static void __hyp_text kvm_adjust_itstate(struct kvm_vcpu *vcpu)
+{
+ unsigned long itbits, cond;
+ unsigned long cpsr = *vcpu_cpsr(vcpu);
+ bool is_arm = !(cpsr & PSR_AA32_T_BIT);
+
+ if (is_arm || !(cpsr & PSR_AA32_IT_MASK))
+ return;
+
+ cond = (cpsr & 0xe000) >> 13;
+ itbits = (cpsr & 0x1c00) >> (10 - 2);
+ itbits |= (cpsr & (0x3 << 25)) >> 25;
+
+ /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
+ if ((itbits & 0x7) == 0)
+ itbits = cond = 0;
+ else
+ itbits = (itbits << 1) & 0x1f;
+
+ cpsr &= ~PSR_AA32_IT_MASK;
+ cpsr |= cond << 13;
+ cpsr |= (itbits & 0x1c) << (10 - 2);
+ cpsr |= (itbits & 0x3) << 25;
+ *vcpu_cpsr(vcpu) = cpsr;
+}
+
+/**
+ * kvm_skip_instr32 - skip a trapped instruction and proceed to the next
+ * @vcpu: The vcpu pointer
+ */
+void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
+{
+ u32 pc = *vcpu_pc(vcpu);
+ bool is_thumb;
+
+ is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
+ if (is_thumb && !is_wide_instr)
+ pc += 2;
+ else
+ pc += 4;
+
+ *vcpu_pc(vcpu) = pc;
+
+ kvm_adjust_itstate(vcpu);
+}
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index d22d0534dd60..90186cf6473e 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -18,6 +18,7 @@
#define CPU_GP_REG_OFFSET(x) (CPU_GP_REGS + x)
#define CPU_XREG_OFFSET(x) CPU_GP_REG_OFFSET(CPU_USER_PT_REGS + 8*x)
+#define CPU_SP_EL0_OFFSET (CPU_XREG_OFFSET(30) + 8)
.text
.pushsection .hyp.text, "ax"
@@ -47,6 +48,16 @@
ldp x29, lr, [\ctxt, #CPU_XREG_OFFSET(29)]
.endm
+.macro save_sp_el0 ctxt, tmp
+ mrs \tmp, sp_el0
+ str \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+.endm
+
+.macro restore_sp_el0 ctxt, tmp
+ ldr \tmp, [\ctxt, #CPU_SP_EL0_OFFSET]
+ msr sp_el0, \tmp
+.endm
+
/*
* u64 __guest_enter(struct kvm_vcpu *vcpu,
* struct kvm_cpu_context *host_ctxt);
@@ -60,6 +71,9 @@ SYM_FUNC_START(__guest_enter)
// Store the host regs
save_callee_saved_regs x1
+ // Save the host's sp_el0
+ save_sp_el0 x1, x2
+
// Now the host state is stored if we have a pending RAS SError it must
// affect the host. If any asynchronous exception is pending we defer
// the guest entry. The DSB isn't necessary before v8.2 as any SError
@@ -83,6 +97,9 @@ alternative_else_nop_endif
// when this feature is enabled for kernel code.
ptrauth_switch_to_guest x29, x0, x1, x2
+ // Restore the guest's sp_el0
+ restore_sp_el0 x29, x0
+
// Restore guest regs x0-x17
ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)]
ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)]
@@ -130,6 +147,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// Store the guest regs x18-x29, lr
save_callee_saved_regs x1
+ // Store the guest's sp_el0
+ save_sp_el0 x1, x2
+
get_host_ctxt x2, x3
// Macro ptrauth_switch_to_guest format:
@@ -139,6 +159,9 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
// when this feature is enabled for kernel code.
ptrauth_switch_to_host x1, x2, x3, x4, x5
+ // Restore the host's sp_el0
+ restore_sp_el0 x2, x3
+
// Now restore the host regs
restore_callee_saved_regs x2
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index c2a13ab3c471..9c5cfb04170e 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -198,7 +198,6 @@ SYM_CODE_END(__hyp_panic)
.macro invalid_vector label, target = __hyp_panic
.align 2
SYM_CODE_START(\label)
-\label:
b \target
SYM_CODE_END(\label)
.endm
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 8a1e81a400e0..676b6585e5ae 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -138,7 +138,7 @@ static void __hyp_text __activate_traps_nvhe(struct kvm_vcpu *vcpu)
write_sysreg(val, cptr_el2);
- if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) {
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
isb();
@@ -181,7 +181,7 @@ static void deactivate_traps_vhe(void)
* above before we can switch to the EL2/EL0 translation regime used by
* the host.
*/
- asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT_VHE));
+ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1);
write_sysreg(vectors, vbar_el1);
@@ -192,7 +192,7 @@ static void __hyp_text __deactivate_traps_nvhe(void)
{
u64 mdcr_el2 = read_sysreg(mdcr_el2);
- if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) {
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
u64 val;
/*
@@ -270,8 +270,8 @@ static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
{
if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
- __vgic_v3_save_state(vcpu);
- __vgic_v3_deactivate_traps(vcpu);
+ __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
+ __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
}
}
@@ -279,8 +279,8 @@ static void __hyp_text __hyp_vgic_save_state(struct kvm_vcpu *vcpu)
static void __hyp_text __hyp_vgic_restore_state(struct kvm_vcpu *vcpu)
{
if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
- __vgic_v3_activate_traps(vcpu);
- __vgic_v3_restore_state(vcpu);
+ __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
+ __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
}
}
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 75b1925763f1..ea5d22fbdacf 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -15,8 +15,9 @@
/*
* Non-VHE: Both host and guest must save everything.
*
- * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and pstate,
- * which are handled as part of the el2 return state) on every switch.
+ * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and
+ * pstate, which are handled as part of the el2 return state) on every
+ * switch (sp_el0 is being dealt with in the assembly code).
* tpidr_el0 and tpidrro_el0 only need to be switched when going
* to host userspace or a different VCPU. EL1 registers only need to be
* switched when potentially going to run a different VCPU. The latter two
@@ -26,12 +27,6 @@
static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
{
ctxt->sys_regs[MDSCR_EL1] = read_sysreg(mdscr_el1);
-
- /*
- * The host arm64 Linux uses sp_el0 to point to 'current' and it must
- * therefore be saved/restored on every entry/exit to/from the guest.
- */
- ctxt->gp_regs.regs.sp = read_sysreg(sp_el0);
}
static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
@@ -99,12 +94,6 @@ NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe);
static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
{
write_sysreg(ctxt->sys_regs[MDSCR_EL1], mdscr_el1);
-
- /*
- * The host arm64 Linux uses sp_el0 to point to 'current' and it must
- * therefore be saved/restored on every entry/exit to/from the guest.
- */
- write_sysreg(ctxt->gp_regs.regs.sp, sp_el0);
}
static void __hyp_text __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
@@ -118,7 +107,8 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg(ctxt->sys_regs[MPIDR_EL1], vmpidr_el2);
write_sysreg(ctxt->sys_regs[CSSELR_EL1], csselr_el1);
- if (!cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) {
+ if (has_vhe() ||
+ !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1], SYS_SCTLR);
write_sysreg_el1(ctxt->sys_regs[TCR_EL1], SYS_TCR);
} else if (!ctxt->__hyp_running_vcpu) {
@@ -149,7 +139,8 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg(ctxt->sys_regs[PAR_EL1], par_el1);
write_sysreg(ctxt->sys_regs[TPIDR_EL1], tpidr_el1);
- if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE) &&
+ if (!has_vhe() &&
+ cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) &&
ctxt->__hyp_running_vcpu) {
/*
* Must only be done for host registers, hence the context
diff --git a/arch/arm64/kvm/hyp/timer-sr.c b/arch/arm64/kvm/hyp/timer-sr.c
new file mode 100644
index 000000000000..fb5c0be33223
--- /dev/null
+++ b/arch/arm64/kvm/hyp/timer-sr.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <clocksource/arm_arch_timer.h>
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_hyp.h>
+
+void __hyp_text __kvm_timer_set_cntvoff(u64 cntvoff)
+{
+ write_sysreg(cntvoff, cntvoff_el2);
+}
+
+/*
+ * Should only be called on non-VHE systems.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
+void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ /* Allow physical timer/counter access for the host */
+ val = read_sysreg(cnthctl_el2);
+ val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN;
+ write_sysreg(val, cnthctl_el2);
+}
+
+/*
+ * Should only be called on non-VHE systems.
+ * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe().
+ */
+void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu)
+{
+ u64 val;
+
+ /*
+ * Disallow physical timer access for the guest
+ * Physical counter access is allowed
+ */
+ val = read_sysreg(cnthctl_el2);
+ val &= ~CNTHCTL_EL1PCEN;
+ val |= CNTHCTL_EL1PCTEN;
+ write_sysreg(val, cnthctl_el2);
+}
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index ceaddbe4279f..d063a576d511 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -23,7 +23,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm,
local_irq_save(cxt->flags);
- if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE)) {
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
/*
* For CPUs that are affected by ARM errata 1165522 or 1530923,
* we cannot trust stage-1 to be in a correct state at that
@@ -63,7 +63,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm,
static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm,
struct tlb_inv_context *cxt)
{
- if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) {
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
u64 val;
/*
@@ -79,8 +79,9 @@ static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm,
isb();
}
+ /* __load_guest_stage2() includes an ISB for the workaround. */
__load_guest_stage2(kvm);
- isb();
+ asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT));
}
static void __hyp_text __tlb_switch_to_guest(struct kvm *kvm,
@@ -103,7 +104,7 @@ static void __hyp_text __tlb_switch_to_host_vhe(struct kvm *kvm,
write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
isb();
- if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_VHE)) {
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
/* Restore the registers to what they were */
write_sysreg_el1(cxt->tcr, SYS_TCR);
write_sysreg_el1(cxt->sctlr, SYS_SCTLR);
@@ -117,7 +118,7 @@ static void __hyp_text __tlb_switch_to_host_nvhe(struct kvm *kvm,
{
write_sysreg(0, vttbr_el2);
- if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT_NVHE)) {
+ if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
/* Ensure write of the host VMID */
isb();
/* Restore the host's TCR_EL1 */
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
new file mode 100644
index 000000000000..10ed539835c1
--- /dev/null
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -0,0 +1,1113 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012-2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/compiler.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+#define vtr_to_max_lr_idx(v) ((v) & 0xf)
+#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
+#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5))
+
+static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
+{
+ switch (lr & 0xf) {
+ case 0:
+ return read_gicreg(ICH_LR0_EL2);
+ case 1:
+ return read_gicreg(ICH_LR1_EL2);
+ case 2:
+ return read_gicreg(ICH_LR2_EL2);
+ case 3:
+ return read_gicreg(ICH_LR3_EL2);
+ case 4:
+ return read_gicreg(ICH_LR4_EL2);
+ case 5:
+ return read_gicreg(ICH_LR5_EL2);
+ case 6:
+ return read_gicreg(ICH_LR6_EL2);
+ case 7:
+ return read_gicreg(ICH_LR7_EL2);
+ case 8:
+ return read_gicreg(ICH_LR8_EL2);
+ case 9:
+ return read_gicreg(ICH_LR9_EL2);
+ case 10:
+ return read_gicreg(ICH_LR10_EL2);
+ case 11:
+ return read_gicreg(ICH_LR11_EL2);
+ case 12:
+ return read_gicreg(ICH_LR12_EL2);
+ case 13:
+ return read_gicreg(ICH_LR13_EL2);
+ case 14:
+ return read_gicreg(ICH_LR14_EL2);
+ case 15:
+ return read_gicreg(ICH_LR15_EL2);
+ }
+
+ unreachable();
+}
+
+static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
+{
+ switch (lr & 0xf) {
+ case 0:
+ write_gicreg(val, ICH_LR0_EL2);
+ break;
+ case 1:
+ write_gicreg(val, ICH_LR1_EL2);
+ break;
+ case 2:
+ write_gicreg(val, ICH_LR2_EL2);
+ break;
+ case 3:
+ write_gicreg(val, ICH_LR3_EL2);
+ break;
+ case 4:
+ write_gicreg(val, ICH_LR4_EL2);
+ break;
+ case 5:
+ write_gicreg(val, ICH_LR5_EL2);
+ break;
+ case 6:
+ write_gicreg(val, ICH_LR6_EL2);
+ break;
+ case 7:
+ write_gicreg(val, ICH_LR7_EL2);
+ break;
+ case 8:
+ write_gicreg(val, ICH_LR8_EL2);
+ break;
+ case 9:
+ write_gicreg(val, ICH_LR9_EL2);
+ break;
+ case 10:
+ write_gicreg(val, ICH_LR10_EL2);
+ break;
+ case 11:
+ write_gicreg(val, ICH_LR11_EL2);
+ break;
+ case 12:
+ write_gicreg(val, ICH_LR12_EL2);
+ break;
+ case 13:
+ write_gicreg(val, ICH_LR13_EL2);
+ break;
+ case 14:
+ write_gicreg(val, ICH_LR14_EL2);
+ break;
+ case 15:
+ write_gicreg(val, ICH_LR15_EL2);
+ break;
+ }
+}
+
+static void __hyp_text __vgic_v3_write_ap0rn(u32 val, int n)
+{
+ switch (n) {
+ case 0:
+ write_gicreg(val, ICH_AP0R0_EL2);
+ break;
+ case 1:
+ write_gicreg(val, ICH_AP0R1_EL2);
+ break;
+ case 2:
+ write_gicreg(val, ICH_AP0R2_EL2);
+ break;
+ case 3:
+ write_gicreg(val, ICH_AP0R3_EL2);
+ break;
+ }
+}
+
+static void __hyp_text __vgic_v3_write_ap1rn(u32 val, int n)
+{
+ switch (n) {
+ case 0:
+ write_gicreg(val, ICH_AP1R0_EL2);
+ break;
+ case 1:
+ write_gicreg(val, ICH_AP1R1_EL2);
+ break;
+ case 2:
+ write_gicreg(val, ICH_AP1R2_EL2);
+ break;
+ case 3:
+ write_gicreg(val, ICH_AP1R3_EL2);
+ break;
+ }
+}
+
+static u32 __hyp_text __vgic_v3_read_ap0rn(int n)
+{
+ u32 val;
+
+ switch (n) {
+ case 0:
+ val = read_gicreg(ICH_AP0R0_EL2);
+ break;
+ case 1:
+ val = read_gicreg(ICH_AP0R1_EL2);
+ break;
+ case 2:
+ val = read_gicreg(ICH_AP0R2_EL2);
+ break;
+ case 3:
+ val = read_gicreg(ICH_AP0R3_EL2);
+ break;
+ default:
+ unreachable();
+ }
+
+ return val;
+}
+
+static u32 __hyp_text __vgic_v3_read_ap1rn(int n)
+{
+ u32 val;
+
+ switch (n) {
+ case 0:
+ val = read_gicreg(ICH_AP1R0_EL2);
+ break;
+ case 1:
+ val = read_gicreg(ICH_AP1R1_EL2);
+ break;
+ case 2:
+ val = read_gicreg(ICH_AP1R2_EL2);
+ break;
+ case 3:
+ val = read_gicreg(ICH_AP1R3_EL2);
+ break;
+ default:
+ unreachable();
+ }
+
+ return val;
+}
+
+void __hyp_text __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
+{
+ u64 used_lrs = cpu_if->used_lrs;
+
+ /*
+ * Make sure stores to the GIC via the memory-mapped interface
+ * are now visible to the system register interface when reading the
+ * LRs, and when reading back the VMCR on non-VHE systems.
+ */
+ if (used_lrs || !has_vhe()) {
+ if (!cpu_if->vgic_sre) {
+ dsb(sy);
+ isb();
+ }
+ }
+
+ if (used_lrs || cpu_if->its_vpe.its_vm) {
+ int i;
+ u32 elrsr;
+
+ elrsr = read_gicreg(ICH_ELRSR_EL2);
+
+ write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
+
+ for (i = 0; i < used_lrs; i++) {
+ if (elrsr & (1 << i))
+ cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
+ else
+ cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
+
+ __gic_v3_set_lr(0, i);
+ }
+ }
+}
+
+void __hyp_text __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
+{
+ u64 used_lrs = cpu_if->used_lrs;
+ int i;
+
+ if (used_lrs || cpu_if->its_vpe.its_vm) {
+ write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+
+ for (i = 0; i < used_lrs; i++)
+ __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+ }
+
+ /*
+ * Ensure that writes to the LRs, and on non-VHE systems ensure that
+ * the write to the VMCR in __vgic_v3_activate_traps(), will have
+ * reached the (re)distributors. This ensures the guest will read the
+ * correct values from the memory-mapped interface.
+ */
+ if (used_lrs || !has_vhe()) {
+ if (!cpu_if->vgic_sre) {
+ isb();
+ dsb(sy);
+ }
+ }
+}
+
+void __hyp_text __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
+{
+ /*
+ * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
+ * Group0 interrupt (as generated in GICv2 mode) to be
+ * delivered as a FIQ to the guest, with potentially fatal
+ * consequences. So we must make sure that ICC_SRE_EL1 has
+ * been actually programmed with the value we want before
+ * starting to mess with the rest of the GIC, and VMCR_EL2 in
+ * particular. This logic must be called before
+ * __vgic_v3_restore_state().
+ */
+ if (!cpu_if->vgic_sre) {
+ write_gicreg(0, ICC_SRE_EL1);
+ isb();
+ write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
+
+ if (has_vhe()) {
+ /*
+ * Ensure that the write to the VMCR will have reached
+ * the (re)distributors. This ensures the guest will
+ * read the correct values from the memory-mapped
+ * interface.
+ */
+ isb();
+ dsb(sy);
+ }
+ }
+
+ /*
+ * Prevent the guest from touching the GIC system registers if
+ * SRE isn't enabled for GICv3 emulation.
+ */
+ write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+ ICC_SRE_EL2);
+
+ /*
+ * If we need to trap system registers, we must write
+ * ICH_HCR_EL2 anyway, even if no interrupts are being
+ * injected,
+ */
+ if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
+ cpu_if->its_vpe.its_vm)
+ write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+}
+
+void __hyp_text __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
+{
+ u64 val;
+
+ if (!cpu_if->vgic_sre) {
+ cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
+ }
+
+ val = read_gicreg(ICC_SRE_EL2);
+ write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+
+ if (!cpu_if->vgic_sre) {
+ /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
+ isb();
+ write_gicreg(1, ICC_SRE_EL1);
+ }
+
+ /*
+ * If we were trapping system registers, we enabled the VGIC even if
+ * no interrupts were being injected, and we disable it again here.
+ */
+ if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
+ cpu_if->its_vpe.its_vm)
+ write_gicreg(0, ICH_HCR_EL2);
+}
+
+void __hyp_text __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
+{
+ u64 val;
+ u32 nr_pre_bits;
+
+ val = read_gicreg(ICH_VTR_EL2);
+ nr_pre_bits = vtr_to_nr_pre_bits(val);
+
+ switch (nr_pre_bits) {
+ case 7:
+ cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
+ cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
+ /* Fall through */
+ case 6:
+ cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
+ /* Fall through */
+ default:
+ cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
+ }
+
+ switch (nr_pre_bits) {
+ case 7:
+ cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
+ cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
+ /* Fall through */
+ case 6:
+ cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
+ /* Fall through */
+ default:
+ cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
+ }
+}
+
+void __hyp_text __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
+{
+ u64 val;
+ u32 nr_pre_bits;
+
+ val = read_gicreg(ICH_VTR_EL2);
+ nr_pre_bits = vtr_to_nr_pre_bits(val);
+
+ switch (nr_pre_bits) {
+ case 7:
+ __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
+ __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
+ /* Fall through */
+ case 6:
+ __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
+ /* Fall through */
+ default:
+ __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
+ }
+
+ switch (nr_pre_bits) {
+ case 7:
+ __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
+ __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
+ /* Fall through */
+ case 6:
+ __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
+ /* Fall through */
+ default:
+ __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
+ }
+}
+
+void __hyp_text __vgic_v3_init_lrs(void)
+{
+ int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
+ int i;
+
+ for (i = 0; i <= max_lr_idx; i++)
+ __gic_v3_set_lr(0, i);
+}
+
+u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void)
+{
+ return read_gicreg(ICH_VTR_EL2);
+}
+
+u64 __hyp_text __vgic_v3_read_vmcr(void)
+{
+ return read_gicreg(ICH_VMCR_EL2);
+}
+
+void __hyp_text __vgic_v3_write_vmcr(u32 vmcr)
+{
+ write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+static int __hyp_text __vgic_v3_bpr_min(void)
+{
+ /* See Pseudocode for VPriorityGroup */
+ return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
+}
+
+static int __hyp_text __vgic_v3_get_group(struct kvm_vcpu *vcpu)
+{
+ u32 esr = kvm_vcpu_get_hsr(vcpu);
+ u8 crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
+
+ return crm != 8;
+}
+
+#define GICv3_IDLE_PRIORITY 0xff
+
+static int __hyp_text __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu,
+ u32 vmcr,
+ u64 *lr_val)
+{
+ unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
+ u8 priority = GICv3_IDLE_PRIORITY;
+ int i, lr = -1;
+
+ for (i = 0; i < used_lrs; i++) {
+ u64 val = __gic_v3_get_lr(i);
+ u8 lr_prio = (val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+
+ /* Not pending in the state? */
+ if ((val & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
+ continue;
+
+ /* Group-0 interrupt, but Group-0 disabled? */
+ if (!(val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG0_MASK))
+ continue;
+
+ /* Group-1 interrupt, but Group-1 disabled? */
+ if ((val & ICH_LR_GROUP) && !(vmcr & ICH_VMCR_ENG1_MASK))
+ continue;
+
+ /* Not the highest priority? */
+ if (lr_prio >= priority)
+ continue;
+
+ /* This is a candidate */
+ priority = lr_prio;
+ *lr_val = val;
+ lr = i;
+ }
+
+ if (lr == -1)
+ *lr_val = ICC_IAR1_EL1_SPURIOUS;
+
+ return lr;
+}
+
+static int __hyp_text __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu,
+ int intid, u64 *lr_val)
+{
+ unsigned int used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
+ int i;
+
+ for (i = 0; i < used_lrs; i++) {
+ u64 val = __gic_v3_get_lr(i);
+
+ if ((val & ICH_LR_VIRTUAL_ID_MASK) == intid &&
+ (val & ICH_LR_ACTIVE_BIT)) {
+ *lr_val = val;
+ return i;
+ }
+ }
+
+ *lr_val = ICC_IAR1_EL1_SPURIOUS;
+ return -1;
+}
+
+static int __hyp_text __vgic_v3_get_highest_active_priority(void)
+{
+ u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
+ u32 hap = 0;
+ int i;
+
+ for (i = 0; i < nr_apr_regs; i++) {
+ u32 val;
+
+ /*
+ * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
+ * contain the active priority levels for this VCPU
+ * for the maximum number of supported priority
+ * levels. We return the full priority level only if
+ * the BPR is programmed to its minimum; otherwise we
+ * return a combination of the priority level and
+ * subpriority, as determined by the setting of the
+ * BPR, but without the full subpriority.
+ */
+ val = __vgic_v3_read_ap0rn(i);
+ val |= __vgic_v3_read_ap1rn(i);
+ if (!val) {
+ hap += 32;
+ continue;
+ }
+
+ return (hap + __ffs(val)) << __vgic_v3_bpr_min();
+ }
+
+ return GICv3_IDLE_PRIORITY;
+}
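A sketch of how an active-priority bit is rescaled back to an 8-bit priority, continuing the assumed 5-preemption-bit configuration above:

	/*
	 * hap == 0 (first APR register), __ffs(val) == 6, __vgic_v3_bpr_min() == 3:
	 * the returned priority is (0 + 6) << 3 == 0x30.
	 */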
+
+static unsigned int __hyp_text __vgic_v3_get_bpr0(u32 vmcr)
+{
+ return (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+}
+
+static unsigned int __hyp_text __vgic_v3_get_bpr1(u32 vmcr)
+{
+ unsigned int bpr;
+
+ if (vmcr & ICH_VMCR_CBPR_MASK) {
+ bpr = __vgic_v3_get_bpr0(vmcr);
+ if (bpr < 7)
+ bpr++;
+ } else {
+ bpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+ }
+
+ return bpr;
+}
+
+/*
+ * Convert a priority to a preemption level, taking the relevant BPR
+ * into account by zeroing the sub-priority bits.
+ */
+static u8 __hyp_text __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
+{
+ unsigned int bpr;
+
+ if (!grp)
+ bpr = __vgic_v3_get_bpr0(vmcr) + 1;
+ else
+ bpr = __vgic_v3_get_bpr1(vmcr);
+
+ return pri & (GENMASK(7, 0) << bpr);
+}
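A rough worked example of the masking, with assumed values: for a Group-1 interrupt and BPR1 == 3, the mask is GENMASK(7, 0) << 3, so only bits [7:3] of the priority survive.

	/* pri == 0x4a, bpr == 3: 0x4a & (0xff << 3) == 0x48 */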
+
+/*
+ * The priority value is independent of any of the BPR values, so we
+ * normalize it using the minimal BPR value. This guarantees that no
+ * matter what the guest does with its BPR, we can always set/get the
+ * same value of a priority.
+ */
+static void __hyp_text __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
+{
+ u8 pre, ap;
+ u32 val;
+ int apr;
+
+ pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
+ ap = pre >> __vgic_v3_bpr_min();
+ apr = ap / 32;
+
+ if (!grp) {
+ val = __vgic_v3_read_ap0rn(apr);
+ __vgic_v3_write_ap0rn(val | BIT(ap % 32), apr);
+ } else {
+ val = __vgic_v3_read_ap1rn(apr);
+ __vgic_v3_write_ap1rn(val | BIT(ap % 32), apr);
+ }
+}
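Continuing that example (still assumed values), the preemption level is rescaled into an APR bit index before the bit is set:

	/*
	 * pre == 0x48, __vgic_v3_bpr_min() == 3: ap == 0x48 >> 3 == 9,
	 * apr == 9 / 32 == 0, so bit 9 of ICH_AP1R0_EL2 is set for a
	 * Group-1 interrupt.
	 */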
+
+static int __hyp_text __vgic_v3_clear_highest_active_priority(void)
+{
+ u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
+ u32 hap = 0;
+ int i;
+
+ for (i = 0; i < nr_apr_regs; i++) {
+ u32 ap0, ap1;
+ int c0, c1;
+
+ ap0 = __vgic_v3_read_ap0rn(i);
+ ap1 = __vgic_v3_read_ap1rn(i);
+ if (!ap0 && !ap1) {
+ hap += 32;
+ continue;
+ }
+
+ c0 = ap0 ? __ffs(ap0) : 32;
+ c1 = ap1 ? __ffs(ap1) : 32;
+
+ /* Always clear the LSB, which is the highest priority */
+ if (c0 < c1) {
+ ap0 &= ~BIT(c0);
+ __vgic_v3_write_ap0rn(ap0, i);
+ hap += c0;
+ } else {
+ ap1 &= ~BIT(c1);
+ __vgic_v3_write_ap1rn(ap1, i);
+ hap += c1;
+ }
+
+ /* Rescale to 8 bits of priority */
+ return hap << __vgic_v3_bpr_min();
+ }
+
+ return GICv3_IDLE_PRIORITY;
+}
+
+static void __hyp_text __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ u64 lr_val;
+ u8 lr_prio, pmr;
+ int lr, grp;
+
+ grp = __vgic_v3_get_group(vcpu);
+
+ lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
+ if (lr < 0)
+ goto spurious;
+
+ if (grp != !!(lr_val & ICH_LR_GROUP))
+ goto spurious;
+
+ pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+ lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+ if (pmr <= lr_prio)
+ goto spurious;
+
+ if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
+ goto spurious;
+
+ lr_val &= ~ICH_LR_STATE;
+ /* No active state for LPIs */
+ if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI)
+ lr_val |= ICH_LR_ACTIVE_BIT;
+ __gic_v3_set_lr(lr_val, lr);
+ __vgic_v3_set_active_priority(lr_prio, vmcr, grp);
+ vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
+ return;
+
+spurious:
+ vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
+}
+
+static void __hyp_text __vgic_v3_clear_active_lr(int lr, u64 lr_val)
+{
+ lr_val &= ~ICH_LR_ACTIVE_BIT;
+ if (lr_val & ICH_LR_HW) {
+ u32 pid;
+
+ pid = (lr_val & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;
+ gic_write_dir(pid);
+ }
+
+ __gic_v3_set_lr(lr_val, lr);
+}
+
+static void __hyp_text __vgic_v3_bump_eoicount(void)
+{
+ u32 hcr;
+
+ hcr = read_gicreg(ICH_HCR_EL2);
+ hcr += 1 << ICH_HCR_EOIcount_SHIFT;
+ write_gicreg(hcr, ICH_HCR_EL2);
+}
+
+static void __hyp_text __vgic_v3_write_dir(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ u32 vid = vcpu_get_reg(vcpu, rt);
+ u64 lr_val;
+ int lr;
+
+ /* EOImode == 0, nothing to be done here */
+ if (!(vmcr & ICH_VMCR_EOIM_MASK))
+ return;
+
+ /* No deactivate to be performed on an LPI */
+ if (vid >= VGIC_MIN_LPI)
+ return;
+
+ lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
+ if (lr == -1) {
+ __vgic_v3_bump_eoicount();
+ return;
+ }
+
+ __vgic_v3_clear_active_lr(lr, lr_val);
+}
+
+static void __hyp_text __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ u32 vid = vcpu_get_reg(vcpu, rt);
+ u64 lr_val;
+ u8 lr_prio, act_prio;
+ int lr, grp;
+
+ grp = __vgic_v3_get_group(vcpu);
+
+ /* Drop priority in any case */
+ act_prio = __vgic_v3_clear_highest_active_priority();
+
+ /* If EOIing an LPI, no deactivate to be performed */
+ if (vid >= VGIC_MIN_LPI)
+ return;
+
+ /* EOImode == 1, nothing to be done here */
+ if (vmcr & ICH_VMCR_EOIM_MASK)
+ return;
+
+ lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
+ if (lr == -1) {
+ __vgic_v3_bump_eoicount();
+ return;
+ }
+
+ lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
+
+ /* If priorities or group do not match, the guest has fscked-up. */
+ if (grp != !!(lr_val & ICH_LR_GROUP) ||
+ __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
+ return;
+
+ /* Let's now perform the deactivation */
+ __vgic_v3_clear_active_lr(lr, lr_val);
+}
+
+static void __hyp_text __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG0_MASK));
+}
+
+static void __hyp_text __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ vcpu_set_reg(vcpu, rt, !!(vmcr & ICH_VMCR_ENG1_MASK));
+}
+
+static void __hyp_text __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ u64 val = vcpu_get_reg(vcpu, rt);
+
+ if (val & 1)
+ vmcr |= ICH_VMCR_ENG0_MASK;
+ else
+ vmcr &= ~ICH_VMCR_ENG0_MASK;
+
+ __vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ u64 val = vcpu_get_reg(vcpu, rt);
+
+ if (val & 1)
+ vmcr |= ICH_VMCR_ENG1_MASK;
+ else
+ vmcr &= ~ICH_VMCR_ENG1_MASK;
+
+ __vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr0(vmcr));
+}
+
+static void __hyp_text __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ vcpu_set_reg(vcpu, rt, __vgic_v3_get_bpr1(vmcr));
+}
+
+static void __hyp_text __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ u64 val = vcpu_get_reg(vcpu, rt);
+ u8 bpr_min = __vgic_v3_bpr_min() - 1;
+
+ /* Enforce BPR limiting */
+ if (val < bpr_min)
+ val = bpr_min;
+
+ val <<= ICH_VMCR_BPR0_SHIFT;
+ val &= ICH_VMCR_BPR0_MASK;
+ vmcr &= ~ICH_VMCR_BPR0_MASK;
+ vmcr |= val;
+
+ __vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ u64 val = vcpu_get_reg(vcpu, rt);
+ u8 bpr_min = __vgic_v3_bpr_min();
+
+ if (vmcr & ICH_VMCR_CBPR_MASK)
+ return;
+
+ /* Enforce BPR limiting */
+ if (val < bpr_min)
+ val = bpr_min;
+
+ val <<= ICH_VMCR_BPR1_SHIFT;
+ val &= ICH_VMCR_BPR1_MASK;
+ vmcr &= ~ICH_VMCR_BPR1_MASK;
+ vmcr |= val;
+
+ __vgic_v3_write_vmcr(vmcr);
+}
+
+static void __hyp_text __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
+{
+ u32 val;
+
+ if (!__vgic_v3_get_group(vcpu))
+ val = __vgic_v3_read_ap0rn(n);
+ else
+ val = __vgic_v3_read_ap1rn(n);
+
+ vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
+{
+ u32 val = vcpu_get_reg(vcpu, rt);
+
+ if (!__vgic_v3_get_group(vcpu))
+ __vgic_v3_write_ap0rn(val, n);
+ else
+ __vgic_v3_write_ap1rn(val, n);
+}
+
+static void __hyp_text __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ __vgic_v3_read_apxrn(vcpu, rt, 0);
+}
+
+static void __hyp_text __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ __vgic_v3_read_apxrn(vcpu, rt, 1);
+}
+
+static void __hyp_text __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ __vgic_v3_read_apxrn(vcpu, rt, 2);
+}
+
+static void __hyp_text __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ __vgic_v3_read_apxrn(vcpu, rt, 3);
+}
+
+static void __hyp_text __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ __vgic_v3_write_apxrn(vcpu, rt, 0);
+}
+
+static void __hyp_text __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ __vgic_v3_write_apxrn(vcpu, rt, 1);
+}
+
+static void __hyp_text __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ __vgic_v3_write_apxrn(vcpu, rt, 2);
+}
+
+static void __hyp_text __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ __vgic_v3_write_apxrn(vcpu, rt, 3);
+}
+
+static void __hyp_text __vgic_v3_read_hppir(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ u64 lr_val;
+ int lr, lr_grp, grp;
+
+ grp = __vgic_v3_get_group(vcpu);
+
+ lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
+ if (lr == -1)
+ goto spurious;
+
+ lr_grp = !!(lr_val & ICH_LR_GROUP);
+ if (lr_grp != grp)
+ lr_val = ICC_IAR1_EL1_SPURIOUS;
+
+spurious:
+ vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
+}
+
+static void __hyp_text __vgic_v3_read_pmr(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ vmcr &= ICH_VMCR_PMR_MASK;
+ vmcr >>= ICH_VMCR_PMR_SHIFT;
+ vcpu_set_reg(vcpu, rt, vmcr);
+}
+
+static void __hyp_text __vgic_v3_write_pmr(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ u32 val = vcpu_get_reg(vcpu, rt);
+
+ val <<= ICH_VMCR_PMR_SHIFT;
+ val &= ICH_VMCR_PMR_MASK;
+ vmcr &= ~ICH_VMCR_PMR_MASK;
+ vmcr |= val;
+
+ write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+static void __hyp_text __vgic_v3_read_rpr(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ u32 val = __vgic_v3_get_highest_active_priority();
+ vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ u32 vtr, val;
+
+ vtr = read_gicreg(ICH_VTR_EL2);
+ /* PRIbits */
+ val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
+ /* IDbits */
+ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
+ /* SEIS */
+ val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT;
+ /* A3V */
+ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
+ /* EOImode */
+ val |= ((vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT) << ICC_CTLR_EL1_EOImode_SHIFT;
+ /* CBPR */
+ val |= (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+
+ vcpu_set_reg(vcpu, rt, val);
+}
+
+static void __hyp_text __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu,
+ u32 vmcr, int rt)
+{
+ u32 val = vcpu_get_reg(vcpu, rt);
+
+ if (val & ICC_CTLR_EL1_CBPR_MASK)
+ vmcr |= ICH_VMCR_CBPR_MASK;
+ else
+ vmcr &= ~ICH_VMCR_CBPR_MASK;
+
+ if (val & ICC_CTLR_EL1_EOImode_MASK)
+ vmcr |= ICH_VMCR_EOIM_MASK;
+ else
+ vmcr &= ~ICH_VMCR_EOIM_MASK;
+
+ write_gicreg(vmcr, ICH_VMCR_EL2);
+}
+
+int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
+{
+ int rt;
+ u32 esr;
+ u32 vmcr;
+ void (*fn)(struct kvm_vcpu *, u32, int);
+ bool is_read;
+ u32 sysreg;
+
+ esr = kvm_vcpu_get_hsr(vcpu);
+ if (vcpu_mode_is_32bit(vcpu)) {
+ if (!kvm_condition_valid(vcpu)) {
+ __kvm_skip_instr(vcpu);
+ return 1;
+ }
+
+ sysreg = esr_cp15_to_sysreg(esr);
+ } else {
+ sysreg = esr_sys64_to_sysreg(esr);
+ }
+
+ is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;
+
+ switch (sysreg) {
+ case SYS_ICC_IAR0_EL1:
+ case SYS_ICC_IAR1_EL1:
+ if (unlikely(!is_read))
+ return 0;
+ fn = __vgic_v3_read_iar;
+ break;
+ case SYS_ICC_EOIR0_EL1:
+ case SYS_ICC_EOIR1_EL1:
+ if (unlikely(is_read))
+ return 0;
+ fn = __vgic_v3_write_eoir;
+ break;
+ case SYS_ICC_IGRPEN1_EL1:
+ if (is_read)
+ fn = __vgic_v3_read_igrpen1;
+ else
+ fn = __vgic_v3_write_igrpen1;
+ break;
+ case SYS_ICC_BPR1_EL1:
+ if (is_read)
+ fn = __vgic_v3_read_bpr1;
+ else
+ fn = __vgic_v3_write_bpr1;
+ break;
+ case SYS_ICC_AP0Rn_EL1(0):
+ case SYS_ICC_AP1Rn_EL1(0):
+ if (is_read)
+ fn = __vgic_v3_read_apxr0;
+ else
+ fn = __vgic_v3_write_apxr0;
+ break;
+ case SYS_ICC_AP0Rn_EL1(1):
+ case SYS_ICC_AP1Rn_EL1(1):
+ if (is_read)
+ fn = __vgic_v3_read_apxr1;
+ else
+ fn = __vgic_v3_write_apxr1;
+ break;
+ case SYS_ICC_AP0Rn_EL1(2):
+ case SYS_ICC_AP1Rn_EL1(2):
+ if (is_read)
+ fn = __vgic_v3_read_apxr2;
+ else
+ fn = __vgic_v3_write_apxr2;
+ break;
+ case SYS_ICC_AP0Rn_EL1(3):
+ case SYS_ICC_AP1Rn_EL1(3):
+ if (is_read)
+ fn = __vgic_v3_read_apxr3;
+ else
+ fn = __vgic_v3_write_apxr3;
+ break;
+ case SYS_ICC_HPPIR0_EL1:
+ case SYS_ICC_HPPIR1_EL1:
+ if (unlikely(!is_read))
+ return 0;
+ fn = __vgic_v3_read_hppir;
+ break;
+ case SYS_ICC_IGRPEN0_EL1:
+ if (is_read)
+ fn = __vgic_v3_read_igrpen0;
+ else
+ fn = __vgic_v3_write_igrpen0;
+ break;
+ case SYS_ICC_BPR0_EL1:
+ if (is_read)
+ fn = __vgic_v3_read_bpr0;
+ else
+ fn = __vgic_v3_write_bpr0;
+ break;
+ case SYS_ICC_DIR_EL1:
+ if (unlikely(is_read))
+ return 0;
+ fn = __vgic_v3_write_dir;
+ break;
+ case SYS_ICC_RPR_EL1:
+ if (unlikely(!is_read))
+ return 0;
+ fn = __vgic_v3_read_rpr;
+ break;
+ case SYS_ICC_CTLR_EL1:
+ if (is_read)
+ fn = __vgic_v3_read_ctlr;
+ else
+ fn = __vgic_v3_write_ctlr;
+ break;
+ case SYS_ICC_PMR_EL1:
+ if (is_read)
+ fn = __vgic_v3_read_pmr;
+ else
+ fn = __vgic_v3_write_pmr;
+ break;
+ default:
+ return 0;
+ }
+
+ vmcr = __vgic_v3_read_vmcr();
+ rt = kvm_vcpu_sys_get_rt(vcpu);
+ fn(vcpu, vmcr, rt);
+
+ __kvm_skip_instr(vcpu);
+
+ return 1;
+}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
new file mode 100644
index 000000000000..550dfa3e53cd
--- /dev/null
+++ b/arch/arm64/kvm/hypercalls.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_hypercalls.h>
+#include <kvm/arm_psci.h>
+
+int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
+{
+ u32 func_id = smccc_get_function(vcpu);
+ long val = SMCCC_RET_NOT_SUPPORTED;
+ u32 feature;
+ gpa_t gpa;
+
+ switch (func_id) {
+ case ARM_SMCCC_VERSION_FUNC_ID:
+ val = ARM_SMCCC_VERSION_1_1;
+ break;
+ case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
+ feature = smccc_get_arg1(vcpu);
+ switch (feature) {
+ case ARM_SMCCC_ARCH_WORKAROUND_1:
+ switch (kvm_arm_harden_branch_predictor()) {
+ case KVM_BP_HARDEN_UNKNOWN:
+ break;
+ case KVM_BP_HARDEN_WA_NEEDED:
+ val = SMCCC_RET_SUCCESS;
+ break;
+ case KVM_BP_HARDEN_NOT_REQUIRED:
+ val = SMCCC_RET_NOT_REQUIRED;
+ break;
+ }
+ break;
+ case ARM_SMCCC_ARCH_WORKAROUND_2:
+ switch (kvm_arm_have_ssbd()) {
+ case KVM_SSBD_FORCE_DISABLE:
+ case KVM_SSBD_UNKNOWN:
+ break;
+ case KVM_SSBD_KERNEL:
+ val = SMCCC_RET_SUCCESS;
+ break;
+ case KVM_SSBD_FORCE_ENABLE:
+ case KVM_SSBD_MITIGATED:
+ val = SMCCC_RET_NOT_REQUIRED;
+ break;
+ }
+ break;
+ case ARM_SMCCC_HV_PV_TIME_FEATURES:
+ val = SMCCC_RET_SUCCESS;
+ break;
+ }
+ break;
+ case ARM_SMCCC_HV_PV_TIME_FEATURES:
+ val = kvm_hypercall_pv_features(vcpu);
+ break;
+ case ARM_SMCCC_HV_PV_TIME_ST:
+ gpa = kvm_init_stolen_time(vcpu);
+ if (gpa != GPA_INVALID)
+ val = gpa;
+ break;
+ default:
+ return kvm_psci_call(vcpu);
+ }
+
+ smccc_set_retval(vcpu, val, 0, 0, 0);
+ return 1;
+}
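For context, a hedged sketch of the guest-side discovery sequence this handler services, assuming a Linux guest using the generic <linux/arm-smccc.h> helpers (the function name below is purely illustrative):

	static void probe_bp_hardening_example(void)
	{
		struct arm_smccc_res res;

		/* First confirm SMCCC v1.1+, otherwise ARCH_FEATURES may not be available */
		arm_smccc_1_1_hvc(ARM_SMCCC_VERSION_FUNC_ID, &res);
		if ((int)res.a0 < 0)
			return;

		/* Then query support for the branch predictor hardening workaround */
		arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
				  ARM_SMCCC_ARCH_WORKAROUND_1, &res);
		/* res.a0: SMCCC_RET_SUCCESS, _NOT_REQUIRED or _NOT_SUPPORTED */
	}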
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 6aafc2825c1c..e21fdd93027a 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -26,28 +26,12 @@ enum exception_type {
except_type_serror = 0x180,
};
-static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type)
-{
- u64 exc_offset;
-
- switch (*vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT)) {
- case PSR_MODE_EL1t:
- exc_offset = CURRENT_EL_SP_EL0_VECTOR;
- break;
- case PSR_MODE_EL1h:
- exc_offset = CURRENT_EL_SP_ELx_VECTOR;
- break;
- case PSR_MODE_EL0t:
- exc_offset = LOWER_EL_AArch64_VECTOR;
- break;
- default:
- exc_offset = LOWER_EL_AArch32_VECTOR;
- }
-
- return vcpu_read_sys_reg(vcpu, VBAR_EL1) + exc_offset + type;
-}
-
/*
+ * This performs the exception entry at a given EL (@target_mode), stashing PC
+ * and PSTATE into ELR and SPSR respectively, and computing the new PC/PSTATE.
+ * The EL passed to this function *must* be a non-secure, privileged mode with
+ * bit 0 being set (PSTATE.SP == 1).
+ *
* When an exception is taken, most PSTATE fields are left unchanged in the
* handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all
* of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx
@@ -59,10 +43,35 @@ static u64 get_except_vector(struct kvm_vcpu *vcpu, enum exception_type type)
* Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from
* MSB to LSB.
*/
-static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu)
+static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
+ enum exception_type type)
{
- unsigned long sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
- unsigned long old, new;
+ unsigned long sctlr, vbar, old, new, mode;
+ u64 exc_offset;
+
+ mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ if (mode == target_mode)
+ exc_offset = CURRENT_EL_SP_ELx_VECTOR;
+ else if ((mode | PSR_MODE_THREAD_BIT) == target_mode)
+ exc_offset = CURRENT_EL_SP_EL0_VECTOR;
+ else if (!(mode & PSR_MODE32_BIT))
+ exc_offset = LOWER_EL_AArch64_VECTOR;
+ else
+ exc_offset = LOWER_EL_AArch32_VECTOR;
+
+ switch (target_mode) {
+ case PSR_MODE_EL1h:
+ vbar = vcpu_read_sys_reg(vcpu, VBAR_EL1);
+ sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
+ vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
+ break;
+ default:
+ /* Don't do that */
+ BUG();
+ }
+
+ *vcpu_pc(vcpu) = vbar + exc_offset + type;
old = *vcpu_cpsr(vcpu);
new = 0;
@@ -105,9 +114,10 @@ static unsigned long get_except64_pstate(struct kvm_vcpu *vcpu)
new |= PSR_I_BIT;
new |= PSR_F_BIT;
- new |= PSR_MODE_EL1h;
+ new |= target_mode;
- return new;
+ *vcpu_cpsr(vcpu) = new;
+ vcpu_write_spsr(vcpu, old);
}
static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
@@ -116,11 +126,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
u32 esr = 0;
- vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
- *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
-
- *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu);
- vcpu_write_spsr(vcpu, cpsr);
+ enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
@@ -148,14 +154,9 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
static void inject_undef64(struct kvm_vcpu *vcpu)
{
- unsigned long cpsr = *vcpu_cpsr(vcpu);
u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
- vcpu_write_elr_el1(vcpu, *vcpu_pc(vcpu));
- *vcpu_pc(vcpu) = get_except_vector(vcpu, except_type_sync);
-
- *vcpu_cpsr(vcpu) = get_except64_pstate(vcpu);
- vcpu_write_spsr(vcpu, cpsr);
+ enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
/*
* Build an unknown exception, depending on the instruction
diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c
new file mode 100644
index 000000000000..4e0366759726
--- /dev/null
+++ b/arch/arm64/kvm/mmio.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
+{
+ void *datap = NULL;
+ union {
+ u8 byte;
+ u16 hword;
+ u32 word;
+ u64 dword;
+ } tmp;
+
+ switch (len) {
+ case 1:
+ tmp.byte = data;
+ datap = &tmp.byte;
+ break;
+ case 2:
+ tmp.hword = data;
+ datap = &tmp.hword;
+ break;
+ case 4:
+ tmp.word = data;
+ datap = &tmp.word;
+ break;
+ case 8:
+ tmp.dword = data;
+ datap = &tmp.dword;
+ break;
+ }
+
+ memcpy(buf, datap, len);
+}
+
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
+{
+ unsigned long data = 0;
+ union {
+ u16 hword;
+ u32 word;
+ u64 dword;
+ } tmp;
+
+ switch (len) {
+ case 1:
+ data = *(u8 *)buf;
+ break;
+ case 2:
+ memcpy(&tmp.hword, buf, len);
+ data = tmp.hword;
+ break;
+ case 4:
+ memcpy(&tmp.word, buf, len);
+ data = tmp.word;
+ break;
+ case 8:
+ memcpy(&tmp.dword, buf, len);
+ data = tmp.dword;
+ break;
+ }
+
+ return data;
+}
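A minimal round-trip sketch for the two helpers above, assuming standalone use (buffer contents follow host endianness, as in the real MMIO path):

	u8 buf[8];
	unsigned long data;

	kvm_mmio_write_buf(buf, 2, 0xbeef);	/* packs a 2-byte hword */
	data = kvm_mmio_read_buf(buf, 2);	/* data == 0xbeef again */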
+
+/**
+ * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
+ * or in-kernel IO emulation
+ *
+ * @vcpu: The VCPU pointer
+ * @run: The VCPU run struct containing the mmio data
+ */
+int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ unsigned long data;
+ unsigned int len;
+ int mask;
+
+ /* Detect an already handled MMIO return */
+ if (unlikely(!vcpu->mmio_needed))
+ return 0;
+
+ vcpu->mmio_needed = 0;
+
+ if (!kvm_vcpu_dabt_iswrite(vcpu)) {
+ len = kvm_vcpu_dabt_get_as(vcpu);
+ data = kvm_mmio_read_buf(run->mmio.data, len);
+
+ if (kvm_vcpu_dabt_issext(vcpu) &&
+ len < sizeof(unsigned long)) {
+ mask = 1U << ((len * 8) - 1);
+ data = (data ^ mask) - mask;
+ }
+
+ if (!kvm_vcpu_dabt_issf(vcpu))
+ data = data & 0xffffffff;
+
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
+ &data);
+ data = vcpu_data_host_to_guest(vcpu, data, len);
+ vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data);
+ }
+
+ /*
+ * The MMIO instruction is emulated and should not be re-executed
+ * in the guest.
+ */
+ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
+ return 0;
+}
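A worked example of the sign-extension trick in the read path above, with assumed values:

	/*
	 * len == 1, data == 0x80: mask == 1U << 7 == 0x80, so
	 * (0x80 ^ 0x80) - 0x80 == -0x80, i.e. the byte is sign-extended
	 * to 0xffffffffffffff80 in the 64-bit destination register.
	 */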
+
+int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ phys_addr_t fault_ipa)
+{
+ unsigned long data;
+ unsigned long rt;
+ int ret;
+ bool is_write;
+ int len;
+ u8 data_buf[8];
+
+ /*
+ * No valid syndrome? Ask userspace for help if it has
+ * volunteered to do so, and bail out otherwise.
+ */
+ if (!kvm_vcpu_dabt_isvalid(vcpu)) {
+ if (vcpu->kvm->arch.return_nisv_io_abort_to_user) {
+ run->exit_reason = KVM_EXIT_ARM_NISV;
+ run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
+ run->arm_nisv.fault_ipa = fault_ipa;
+ return 0;
+ }
+
+ kvm_pr_unimpl("Data abort outside memslots with no valid syndrome info\n");
+ return -ENOSYS;
+ }
+
+ /* Page table accesses IO mem: tell guest to fix its TTBR */
+ if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+ kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ return 1;
+ }
+
+ /*
+ * Prepare MMIO operation. First decode the syndrome data we get
+ * from the CPU. Then check whether some in-kernel emulation feels
+ * responsible; otherwise let user space do its magic.
+ */
+ is_write = kvm_vcpu_dabt_iswrite(vcpu);
+ len = kvm_vcpu_dabt_get_as(vcpu);
+ rt = kvm_vcpu_dabt_get_rd(vcpu);
+
+ if (is_write) {
+ data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
+ len);
+
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
+ kvm_mmio_write_buf(data_buf, len, data);
+
+ ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+ data_buf);
+ } else {
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
+ fault_ipa, NULL);
+
+ ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
+ data_buf);
+ }
+
+ /* Now prepare kvm_run for the potential return to userland. */
+ run->mmio.is_write = is_write;
+ run->mmio.phys_addr = fault_ipa;
+ run->mmio.len = len;
+ vcpu->mmio_needed = 1;
+
+ if (!ret) {
+ /* We handled the access successfully in the kernel. */
+ if (!is_write)
+ memcpy(run->mmio.data, data_buf, len);
+ vcpu->stat.mmio_exit_kernel++;
+ kvm_handle_mmio_return(vcpu, run);
+ return 1;
+ }
+
+ if (is_write)
+ memcpy(run->mmio.data, data_buf, len);
+ vcpu->stat.mmio_exit_user++;
+ run->exit_reason = KVM_EXIT_MMIO;
+ return 0;
+}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
new file mode 100644
index 000000000000..290154e32c0b
--- /dev/null
+++ b/arch/arm64/kvm/mmu.c
@@ -0,0 +1,2612 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ */
+
+#include <linux/mman.h>
+#include <linux/kvm_host.h>
+#include <linux/io.h>
+#include <linux/hugetlb.h>
+#include <linux/sched/signal.h>
+#include <trace/events/kvm.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_ras.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/virt.h>
+
+#include "trace.h"
+
+static pgd_t *boot_hyp_pgd;
+static pgd_t *hyp_pgd;
+static pgd_t *merged_hyp_pgd;
+static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
+
+static unsigned long hyp_idmap_start;
+static unsigned long hyp_idmap_end;
+static phys_addr_t hyp_idmap_vector;
+
+static unsigned long io_map_base;
+
+#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
+
+#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
+#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
+
+static bool is_iomap(unsigned long flags)
+{
+ return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+}
+
+static bool memslot_is_logging(struct kvm_memory_slot *memslot)
+{
+ return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
+}
+
+/**
+ * kvm_flush_remote_tlbs() - flush all VM TLB entries for v7/8
+ * @kvm: pointer to kvm structure.
+ *
+ * Interface to HYP function to flush all VM TLB entries
+ */
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+ kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
+}
+
+static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+{
+ kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+}
+
+/*
+ * D-Cache management functions. They take the page table entries by
+ * value, as they are flushing the cache using the kernel mapping (or
+ * kmap on 32bit).
+ */
+static void kvm_flush_dcache_pte(pte_t pte)
+{
+ __kvm_flush_dcache_pte(pte);
+}
+
+static void kvm_flush_dcache_pmd(pmd_t pmd)
+{
+ __kvm_flush_dcache_pmd(pmd);
+}
+
+static void kvm_flush_dcache_pud(pud_t pud)
+{
+ __kvm_flush_dcache_pud(pud);
+}
+
+static bool kvm_is_device_pfn(unsigned long pfn)
+{
+ return !pfn_valid(pfn);
+}
+
+/**
+ * stage2_dissolve_pmd() - clear and flush huge PMD entry
+ * @kvm: pointer to kvm structure.
+ * @addr: IPA
+ * @pmd: pmd pointer for IPA
+ *
+ * Clears a PMD entry and flushes the 1st and 2nd stage TLBs for @addr.
+ */
+static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
+{
+ if (!pmd_thp_or_huge(*pmd))
+ return;
+
+ pmd_clear(pmd);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ put_page(virt_to_page(pmd));
+}
+
+/**
+ * stage2_dissolve_pud() - clear and flush huge PUD entry
+ * @kvm: pointer to kvm structure.
+ * @addr: IPA
+ * @pudp: pud pointer for IPA
+ *
+ * Clears a PUD entry and flushes the 1st and 2nd stage TLBs for @addr.
+ */
+static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
+{
+ if (!stage2_pud_huge(kvm, *pudp))
+ return;
+
+ stage2_pud_clear(kvm, pudp);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ put_page(virt_to_page(pudp));
+}
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ int min, int max)
+{
+ void *page;
+
+ BUG_ON(max > KVM_NR_MEM_OBJS);
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < max) {
+ page = (void *)__get_free_page(GFP_PGTABLE_USER);
+ if (!page)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = page;
+ }
+ return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+ void *p;
+
+ BUG_ON(!mc || !mc->nobjs);
+ p = mc->objects[--mc->nobjs];
+ return p;
+}
+
+static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+{
+ p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
+ stage2_pgd_clear(kvm, pgd);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ stage2_p4d_free(kvm, p4d_table);
+ put_page(virt_to_page(pgd));
+}
+
+static void clear_stage2_p4d_entry(struct kvm *kvm, p4d_t *p4d, phys_addr_t addr)
+{
+ pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
+ stage2_p4d_clear(kvm, p4d);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ stage2_pud_free(kvm, pud_table);
+ put_page(virt_to_page(p4d));
+}
+
+static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+{
+ pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
+ VM_BUG_ON(stage2_pud_huge(kvm, *pud));
+ stage2_pud_clear(kvm, pud);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ stage2_pmd_free(kvm, pmd_table);
+ put_page(virt_to_page(pud));
+}
+
+static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+{
+ pte_t *pte_table = pte_offset_kernel(pmd, 0);
+ VM_BUG_ON(pmd_thp_or_huge(*pmd));
+ pmd_clear(pmd);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ free_page((unsigned long)pte_table);
+ put_page(virt_to_page(pmd));
+}
+
+static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
+{
+ WRITE_ONCE(*ptep, new_pte);
+ dsb(ishst);
+}
+
+static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
+{
+ WRITE_ONCE(*pmdp, new_pmd);
+ dsb(ishst);
+}
+
+static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
+{
+ kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
+}
+
+static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
+{
+ WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
+ dsb(ishst);
+}
+
+static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
+{
+ WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
+ dsb(ishst);
+}
+
+static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
+{
+#ifndef __PAGETABLE_P4D_FOLDED
+ WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
+ dsb(ishst);
+#endif
+}
+
+/*
+ * Unmapping vs dcache management:
+ *
+ * If a guest maps certain memory pages as uncached, all writes will
+ * bypass the data cache and go directly to RAM. However, the CPUs
+ * can still speculate reads (not writes) and fill cache lines with
+ * data.
+ *
+ * Those cache lines will be *clean* cache lines though, so a
+ * clean+invalidate operation is equivalent to an invalidate
+ * operation, because no cache lines are marked dirty.
+ *
+ * Those clean cache lines could be filled prior to an uncached write
+ * by the guest, and the cache coherent IO subsystem would therefore
+ * end up writing old data to disk.
+ *
+ * This is why right after unmapping a page/section and invalidating
+ * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
+ * the IO subsystem will never hit in the cache.
+ *
+ * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
+ * we then fully enforce cacheability of RAM, no matter what the guest
+ * does.
+ */
+static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
+ phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t start_addr = addr;
+ pte_t *pte, *start_pte;
+
+ start_pte = pte = pte_offset_kernel(pmd, addr);
+ do {
+ if (!pte_none(*pte)) {
+ pte_t old_pte = *pte;
+
+ kvm_set_pte(pte, __pte(0));
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+
+ /* No need to invalidate the cache for device mappings */
+ if (!kvm_is_device_pfn(pte_pfn(old_pte)))
+ kvm_flush_dcache_pte(old_pte);
+
+ put_page(virt_to_page(pte));
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ if (stage2_pte_table_empty(kvm, start_pte))
+ clear_stage2_pmd_entry(kvm, pmd, start_addr);
+}
+
+static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
+ phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t next, start_addr = addr;
+ pmd_t *pmd, *start_pmd;
+
+ start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
+ do {
+ next = stage2_pmd_addr_end(kvm, addr, end);
+ if (!pmd_none(*pmd)) {
+ if (pmd_thp_or_huge(*pmd)) {
+ pmd_t old_pmd = *pmd;
+
+ pmd_clear(pmd);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+
+ kvm_flush_dcache_pmd(old_pmd);
+
+ put_page(virt_to_page(pmd));
+ } else {
+ unmap_stage2_ptes(kvm, pmd, addr, next);
+ }
+ }
+ } while (pmd++, addr = next, addr != end);
+
+ if (stage2_pmd_table_empty(kvm, start_pmd))
+ clear_stage2_pud_entry(kvm, pud, start_addr);
+}
+
+static void unmap_stage2_puds(struct kvm *kvm, p4d_t *p4d,
+ phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t next, start_addr = addr;
+ pud_t *pud, *start_pud;
+
+ start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
+ do {
+ next = stage2_pud_addr_end(kvm, addr, end);
+ if (!stage2_pud_none(kvm, *pud)) {
+ if (stage2_pud_huge(kvm, *pud)) {
+ pud_t old_pud = *pud;
+
+ stage2_pud_clear(kvm, pud);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ kvm_flush_dcache_pud(old_pud);
+ put_page(virt_to_page(pud));
+ } else {
+ unmap_stage2_pmds(kvm, pud, addr, next);
+ }
+ }
+ } while (pud++, addr = next, addr != end);
+
+ if (stage2_pud_table_empty(kvm, start_pud))
+ clear_stage2_p4d_entry(kvm, p4d, start_addr);
+}
+
+static void unmap_stage2_p4ds(struct kvm *kvm, pgd_t *pgd,
+ phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t next, start_addr = addr;
+ p4d_t *p4d, *start_p4d;
+
+ start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
+ do {
+ next = stage2_p4d_addr_end(kvm, addr, end);
+ if (!stage2_p4d_none(kvm, *p4d))
+ unmap_stage2_puds(kvm, p4d, addr, next);
+ } while (p4d++, addr = next, addr != end);
+
+ if (stage2_p4d_table_empty(kvm, start_p4d))
+ clear_stage2_pgd_entry(kvm, pgd, start_addr);
+}
+
+/**
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm: The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size: The size of the area to unmap
+ *
+ * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
+ * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+ * destroying the VM), otherwise another faulting VCPU may come in and mess
+ * with things behind our backs.
+ */
+static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
+{
+ pgd_t *pgd;
+ phys_addr_t addr = start, end = start + size;
+ phys_addr_t next;
+
+ assert_spin_locked(&kvm->mmu_lock);
+ WARN_ON(size & ~PAGE_MASK);
+
+ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+ do {
+ /*
+ * Make sure the page table is still active, as another thread
+ * could have possibly freed the page table while we released
+ * the lock.
+ */
+ if (!READ_ONCE(kvm->arch.pgd))
+ break;
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ if (!stage2_pgd_none(kvm, *pgd))
+ unmap_stage2_p4ds(kvm, pgd, addr, next);
+ /*
+ * If the range is too large, release the kvm->mmu_lock
+ * to prevent starvation and lockup detector warnings.
+ */
+ if (next != end)
+ cond_resched_lock(&kvm->mmu_lock);
+ } while (pgd++, addr = next, addr != end);
+}
+
+static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
+ phys_addr_t addr, phys_addr_t end)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
+ kvm_flush_dcache_pte(*pte);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
+ phys_addr_t addr, phys_addr_t end)
+{
+ pmd_t *pmd;
+ phys_addr_t next;
+
+ pmd = stage2_pmd_offset(kvm, pud, addr);
+ do {
+ next = stage2_pmd_addr_end(kvm, addr, end);
+ if (!pmd_none(*pmd)) {
+ if (pmd_thp_or_huge(*pmd))
+ kvm_flush_dcache_pmd(*pmd);
+ else
+ stage2_flush_ptes(kvm, pmd, addr, next);
+ }
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void stage2_flush_puds(struct kvm *kvm, p4d_t *p4d,
+ phys_addr_t addr, phys_addr_t end)
+{
+ pud_t *pud;
+ phys_addr_t next;
+
+ pud = stage2_pud_offset(kvm, p4d, addr);
+ do {
+ next = stage2_pud_addr_end(kvm, addr, end);
+ if (!stage2_pud_none(kvm, *pud)) {
+ if (stage2_pud_huge(kvm, *pud))
+ kvm_flush_dcache_pud(*pud);
+ else
+ stage2_flush_pmds(kvm, pud, addr, next);
+ }
+ } while (pud++, addr = next, addr != end);
+}
+
+static void stage2_flush_p4ds(struct kvm *kvm, pgd_t *pgd,
+ phys_addr_t addr, phys_addr_t end)
+{
+ p4d_t *p4d;
+ phys_addr_t next;
+
+ p4d = stage2_p4d_offset(kvm, pgd, addr);
+ do {
+ next = stage2_p4d_addr_end(kvm, addr, end);
+ if (!stage2_p4d_none(kvm, *p4d))
+ stage2_flush_puds(kvm, p4d, addr, next);
+ } while (p4d++, addr = next, addr != end);
+}
+
+static void stage2_flush_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+ phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
+ phys_addr_t next;
+ pgd_t *pgd;
+
+ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+ do {
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ if (!stage2_pgd_none(kvm, *pgd))
+ stage2_flush_p4ds(kvm, pgd, addr, next);
+
+ if (next != end)
+ cond_resched_lock(&kvm->mmu_lock);
+ } while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the stage 2 page tables and invalidate any cache lines
+ * backing memory already mapped to the VM.
+ */
+static void stage2_flush_vm(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ spin_lock(&kvm->mmu_lock);
+
+ slots = kvm_memslots(kvm);
+ kvm_for_each_memslot(memslot, slots)
+ stage2_flush_memslot(kvm, memslot);
+
+ spin_unlock(&kvm->mmu_lock);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
+static void clear_hyp_pgd_entry(pgd_t *pgd)
+{
+ p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
+ pgd_clear(pgd);
+ p4d_free(NULL, p4d_table);
+ put_page(virt_to_page(pgd));
+}
+
+static void clear_hyp_p4d_entry(p4d_t *p4d)
+{
+ pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
+ VM_BUG_ON(p4d_huge(*p4d));
+ p4d_clear(p4d);
+ pud_free(NULL, pud_table);
+ put_page(virt_to_page(p4d));
+}
+
+static void clear_hyp_pud_entry(pud_t *pud)
+{
+ pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+ VM_BUG_ON(pud_huge(*pud));
+ pud_clear(pud);
+ pmd_free(NULL, pmd_table);
+ put_page(virt_to_page(pud));
+}
+
+static void clear_hyp_pmd_entry(pmd_t *pmd)
+{
+ pte_t *pte_table = pte_offset_kernel(pmd, 0);
+ VM_BUG_ON(pmd_thp_or_huge(*pmd));
+ pmd_clear(pmd);
+ pte_free_kernel(NULL, pte_table);
+ put_page(virt_to_page(pmd));
+}
+
+static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+ pte_t *pte, *start_pte;
+
+ start_pte = pte = pte_offset_kernel(pmd, addr);
+ do {
+ if (!pte_none(*pte)) {
+ kvm_set_pte(pte, __pte(0));
+ put_page(virt_to_page(pte));
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+
+ if (hyp_pte_table_empty(start_pte))
+ clear_hyp_pmd_entry(pmd);
+}
+
+static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t next;
+ pmd_t *pmd, *start_pmd;
+
+ start_pmd = pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ /* Hyp doesn't use huge pmds */
+ if (!pmd_none(*pmd))
+ unmap_hyp_ptes(pmd, addr, next);
+ } while (pmd++, addr = next, addr != end);
+
+ if (hyp_pmd_table_empty(start_pmd))
+ clear_hyp_pud_entry(pud);
+}
+
+static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t next;
+ pud_t *pud, *start_pud;
+
+ start_pud = pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ /* Hyp doesn't use huge puds */
+ if (!pud_none(*pud))
+ unmap_hyp_pmds(pud, addr, next);
+ } while (pud++, addr = next, addr != end);
+
+ if (hyp_pud_table_empty(start_pud))
+ clear_hyp_p4d_entry(p4d);
+}
+
+static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+{
+ phys_addr_t next;
+ p4d_t *p4d, *start_p4d;
+
+ start_p4d = p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ /* Hyp doesn't use huge p4ds */
+ if (!p4d_none(*p4d))
+ unmap_hyp_puds(p4d, addr, next);
+ } while (p4d++, addr = next, addr != end);
+
+ if (hyp_p4d_table_empty(start_p4d))
+ clear_hyp_pgd_entry(pgd);
+}
+
+static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
+{
+ return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
+}
+
+static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
+ phys_addr_t start, u64 size)
+{
+ pgd_t *pgd;
+ phys_addr_t addr = start, end = start + size;
+ phys_addr_t next;
+
+ /*
+ * We don't unmap anything from HYP, except at the hyp tear down.
+ * Hence, we don't have to invalidate the TLBs here.
+ */
+ pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (!pgd_none(*pgd))
+ unmap_hyp_p4ds(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+}
+
+static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+ __unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
+}
+
+static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+ __unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
+}
+
+/**
+ * free_hyp_pgds - free Hyp-mode page tables
+ *
+ * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
+ * therefore contains either mappings in the kernel memory area (above
+ * PAGE_OFFSET), or device mappings in the idmap range.
+ *
+ * boot_hyp_pgd should only map the idmap range, and is only used in
+ * the extended idmap case.
+ */
+void free_hyp_pgds(void)
+{
+ pgd_t *id_pgd;
+
+ mutex_lock(&kvm_hyp_pgd_mutex);
+
+ id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
+
+ if (id_pgd) {
+ /* In case we never called hyp_mmu_init() */
+ if (!io_map_base)
+ io_map_base = hyp_idmap_start;
+ unmap_hyp_idmap_range(id_pgd, io_map_base,
+ hyp_idmap_start + PAGE_SIZE - io_map_base);
+ }
+
+ if (boot_hyp_pgd) {
+ free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
+ boot_hyp_pgd = NULL;
+ }
+
+ if (hyp_pgd) {
+ unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
+ (uintptr_t)high_memory - PAGE_OFFSET);
+
+ free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
+ hyp_pgd = NULL;
+ }
+ if (merged_hyp_pgd) {
+ clear_page(merged_hyp_pgd);
+ free_page((unsigned long)merged_hyp_pgd);
+ merged_hyp_pgd = NULL;
+ }
+
+ mutex_unlock(&kvm_hyp_pgd_mutex);
+}
+
+static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
+ unsigned long end, unsigned long pfn,
+ pgprot_t prot)
+{
+ pte_t *pte;
+ unsigned long addr;
+
+ addr = start;
+ do {
+ pte = pte_offset_kernel(pmd, addr);
+ kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
+ get_page(virt_to_page(pte));
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+}
+
+static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
+ unsigned long end, unsigned long pfn,
+ pgprot_t prot)
+{
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long addr, next;
+
+ addr = start;
+ do {
+ pmd = pmd_offset(pud, addr);
+
+ BUG_ON(pmd_sect(*pmd));
+
+ if (pmd_none(*pmd)) {
+ pte = pte_alloc_one_kernel(NULL);
+ if (!pte) {
+ kvm_err("Cannot allocate Hyp pte\n");
+ return -ENOMEM;
+ }
+ kvm_pmd_populate(pmd, pte);
+ get_page(virt_to_page(pmd));
+ }
+
+ next = pmd_addr_end(addr, end);
+
+ create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
+ pfn += (next - addr) >> PAGE_SHIFT;
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+
+static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
+ unsigned long end, unsigned long pfn,
+ pgprot_t prot)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ unsigned long addr, next;
+ int ret;
+
+ addr = start;
+ do {
+ pud = pud_offset(p4d, addr);
+
+ if (pud_none_or_clear_bad(pud)) {
+ pmd = pmd_alloc_one(NULL, addr);
+ if (!pmd) {
+ kvm_err("Cannot allocate Hyp pmd\n");
+ return -ENOMEM;
+ }
+ kvm_pud_populate(pud, pmd);
+ get_page(virt_to_page(pud));
+ }
+
+ next = pud_addr_end(addr, end);
+ ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
+ if (ret)
+ return ret;
+ pfn += (next - addr) >> PAGE_SHIFT;
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+
+static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
+ unsigned long end, unsigned long pfn,
+ pgprot_t prot)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ unsigned long addr, next;
+ int ret;
+
+ addr = start;
+ do {
+ p4d = p4d_offset(pgd, addr);
+
+ if (p4d_none(*p4d)) {
+ pud = pud_alloc_one(NULL, addr);
+ if (!pud) {
+ kvm_err("Cannot allocate Hyp pud\n");
+ return -ENOMEM;
+ }
+ kvm_p4d_populate(p4d, pud);
+ get_page(virt_to_page(p4d));
+ }
+
+ next = p4d_addr_end(addr, end);
+ ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
+ if (ret)
+ return ret;
+ pfn += (next - addr) >> PAGE_SHIFT;
+ } while (addr = next, addr != end);
+
+ return 0;
+}
+
+static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
+ unsigned long start, unsigned long end,
+ unsigned long pfn, pgprot_t prot)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+ unsigned long addr, next;
+ int err = 0;
+
+ mutex_lock(&kvm_hyp_pgd_mutex);
+ addr = start & PAGE_MASK;
+ end = PAGE_ALIGN(end);
+ do {
+ pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
+
+ if (pgd_none(*pgd)) {
+ p4d = p4d_alloc_one(NULL, addr);
+ if (!p4d) {
+ kvm_err("Cannot allocate Hyp p4d\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ kvm_pgd_populate(pgd, p4d);
+ get_page(virt_to_page(pgd));
+ }
+
+ next = pgd_addr_end(addr, end);
+ err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
+ if (err)
+ goto out;
+ pfn += (next - addr) >> PAGE_SHIFT;
+ } while (addr = next, addr != end);
+out:
+ mutex_unlock(&kvm_hyp_pgd_mutex);
+ return err;
+}
+
+static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
+{
+ if (!is_vmalloc_addr(kaddr)) {
+ BUG_ON(!virt_addr_valid(kaddr));
+ return __pa(kaddr);
+ } else {
+ return page_to_phys(vmalloc_to_page(kaddr)) +
+ offset_in_page(kaddr);
+ }
+}
+
+/**
+ * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
+ * @from: The virtual kernel start address of the range
+ * @to: The virtual kernel end address of the range (exclusive)
+ * @prot: The protection to be applied to this range
+ *
+ * The same virtual address as the kernel virtual address is also used
+ * in Hyp-mode (modulo HYP_PAGE_OFFSET) to map the same underlying
+ * physical pages.
+ */
+int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+{
+ phys_addr_t phys_addr;
+ unsigned long virt_addr;
+ unsigned long start = kern_hyp_va((unsigned long)from);
+ unsigned long end = kern_hyp_va((unsigned long)to);
+
+ if (is_kernel_in_hyp_mode())
+ return 0;
+
+ start = start & PAGE_MASK;
+ end = PAGE_ALIGN(end);
+
+ for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
+ int err;
+
+ phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
+ err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
+ virt_addr, virt_addr + PAGE_SIZE,
+ __phys_to_pfn(phys_addr),
+ prot);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
+ unsigned long *haddr, pgprot_t prot)
+{
+ pgd_t *pgd = hyp_pgd;
+ unsigned long base;
+ int ret = 0;
+
+ mutex_lock(&kvm_hyp_pgd_mutex);
+
+ /*
+ * This assumes that we have enough space below the idmap
+ * page to allocate our VAs. If not, the check below will
+ * kick in. A potential alternative would be to detect that
+ * overflow and switch to an allocation above the idmap.
+ *
+ * The allocated size is always a multiple of PAGE_SIZE.
+ */
+ size = PAGE_ALIGN(size + offset_in_page(phys_addr));
+ base = io_map_base - size;
+
+ /*
+ * Verify that BIT(VA_BITS - 1) hasn't been flipped by
+ * allocating the new area, as it would indicate we've
+ * overflowed the idmap/IO address range.
+ */
+ if ((base ^ io_map_base) & BIT(VA_BITS - 1))
+ ret = -ENOMEM;
+ else
+ io_map_base = base;
+
+ mutex_unlock(&kvm_hyp_pgd_mutex);
+
+ if (ret)
+ goto out;
+
+ if (__kvm_cpu_uses_extended_idmap())
+ pgd = boot_hyp_pgd;
+
+ ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
+ base, base + size,
+ __phys_to_pfn(phys_addr), prot);
+ if (ret)
+ goto out;
+
+ *haddr = base + offset_in_page(phys_addr);
+
+out:
+ return ret;
+}
+
+/**
+ * create_hyp_io_mappings - Map IO into both kernel and HYP
+ * @phys_addr: The physical start address which gets mapped
+ * @size: Size of the region being mapped
+ * @kaddr: Kernel VA for this mapping
+ * @haddr: HYP VA for this mapping
+ */
+int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
+ void __iomem **kaddr,
+ void __iomem **haddr)
+{
+ unsigned long addr;
+ int ret;
+
+ *kaddr = ioremap(phys_addr, size);
+ if (!*kaddr)
+ return -ENOMEM;
+
+ if (is_kernel_in_hyp_mode()) {
+ *haddr = *kaddr;
+ return 0;
+ }
+
+ ret = __create_hyp_private_mapping(phys_addr, size,
+ &addr, PAGE_HYP_DEVICE);
+ if (ret) {
+ iounmap(*kaddr);
+ *kaddr = NULL;
+ *haddr = NULL;
+ return ret;
+ }
+
+ *haddr = (void __iomem *)addr;
+ return 0;
+}
+
+/**
+ * create_hyp_exec_mappings - Map an executable range into HYP
+ * @phys_addr: The physical start address which gets mapped
+ * @size: Size of the region being mapped
+ * @haddr: HYP VA for this mapping
+ */
+int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
+ void **haddr)
+{
+ unsigned long addr;
+ int ret;
+
+ BUG_ON(is_kernel_in_hyp_mode());
+
+ ret = __create_hyp_private_mapping(phys_addr, size,
+ &addr, PAGE_HYP_EXEC);
+ if (ret) {
+ *haddr = NULL;
+ return ret;
+ }
+
+ *haddr = (void *)addr;
+ return 0;
+}
+
+/**
+ * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
+ * @kvm: The KVM struct pointer for the VM.
+ *
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
+ *
+ * Note we don't need locking here as this is only called when the VM is
+ * created, which can only be done once.
+ */
+int kvm_alloc_stage2_pgd(struct kvm *kvm)
+{
+ phys_addr_t pgd_phys;
+ pgd_t *pgd;
+
+ if (kvm->arch.pgd != NULL) {
+ kvm_err("kvm_arch already initialized?\n");
+ return -EINVAL;
+ }
+
+ /* Allocate the HW PGD, making sure that each page gets its own refcount */
+ pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
+ if (!pgd)
+ return -ENOMEM;
+
+ pgd_phys = virt_to_phys(pgd);
+ if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
+ return -EINVAL;
+
+ kvm->arch.pgd = pgd;
+ kvm->arch.pgd_phys = pgd_phys;
+ return 0;
+}
+
+static void stage2_unmap_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ hva_t hva = memslot->userspace_addr;
+ phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+ phys_addr_t size = PAGE_SIZE * memslot->npages;
+ hva_t reg_end = hva + size;
+
+ /*
+ * A memory region could potentially cover multiple VMAs, and any holes
+ * between them, so iterate over all of them to find out if we should
+ * unmap any of them.
+ *
+ * +--------------------------------------------+
+ * +---------------+----------------+ +----------------+
+ * | : VMA 1 | VMA 2 | | VMA 3 : |
+ * +---------------+----------------+ +----------------+
+ * | memory region |
+ * +--------------------------------------------+
+ */
+ do {
+ struct vm_area_struct *vma = find_vma(current->mm, hva);
+ hva_t vm_start, vm_end;
+
+ if (!vma || vma->vm_start >= reg_end)
+ break;
+
+ /*
+ * Take the intersection of this VMA with the memory region
+ */
+ vm_start = max(hva, vma->vm_start);
+ vm_end = min(reg_end, vma->vm_end);
+
+ if (!(vma->vm_flags & VM_PFNMAP)) {
+ gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
+ unmap_stage2_range(kvm, gpa, vm_end - vm_start);
+ }
+ hva = vm_end;
+ } while (hva < reg_end);
+}
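A worked example of the VMA/memslot intersection above, using assumed addresses:

	/*
	 * memslot HVAs: [0x1000, 0x9000), VMA: [0x0000, 0x4000)
	 * vm_start == max(0x1000, 0x0000) == 0x1000
	 * vm_end   == min(0x9000, 0x4000) == 0x4000
	 * -> unmap_stage2_range() covers the IPAs backing [0x1000, 0x4000),
	 *    then the loop continues from hva == 0x4000.
	 */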
+
+/**
+ * stage2_unmap_vm - Unmap Stage-2 RAM mappings
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the memregions and unmap any regular RAM
+ * backing memory already mapped to the VM.
+ */
+void stage2_unmap_vm(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ down_read(&current->mm->mmap_sem);
+ spin_lock(&kvm->mmu_lock);
+
+ slots = kvm_memslots(kvm);
+ kvm_for_each_memslot(memslot, slots)
+ stage2_unmap_memslot(kvm, memslot);
+
+ spin_unlock(&kvm->mmu_lock);
+ up_read(&current->mm->mmap_sem);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
+/**
+ * kvm_free_stage2_pgd - free all stage-2 tables
+ * @kvm: The KVM struct pointer for the VM.
+ *
+ * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
+ * underlying level-2 and level-3 tables before freeing the actual level-1 table
+ * and setting the struct pointer to NULL.
+ */
+void kvm_free_stage2_pgd(struct kvm *kvm)
+{
+ void *pgd = NULL;
+
+ spin_lock(&kvm->mmu_lock);
+ if (kvm->arch.pgd) {
+ unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
+ pgd = READ_ONCE(kvm->arch.pgd);
+ kvm->arch.pgd = NULL;
+ kvm->arch.pgd_phys = 0;
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ /* Free the HW pgd, one page at a time */
+ if (pgd)
+ free_pages_exact(pgd, stage2_pgd_size(kvm));
+}
+
+static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ phys_addr_t addr)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+ if (stage2_pgd_none(kvm, *pgd)) {
+ if (!cache)
+ return NULL;
+ p4d = mmu_memory_cache_alloc(cache);
+ stage2_pgd_populate(kvm, pgd, p4d);
+ get_page(virt_to_page(pgd));
+ }
+
+ return stage2_p4d_offset(kvm, pgd, addr);
+}
+
+static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ phys_addr_t addr)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+
+ p4d = stage2_get_p4d(kvm, cache, addr);
+ if (stage2_p4d_none(kvm, *p4d)) {
+ if (!cache)
+ return NULL;
+ pud = mmu_memory_cache_alloc(cache);
+ stage2_p4d_populate(kvm, p4d, pud);
+ get_page(virt_to_page(p4d));
+ }
+
+ return stage2_pud_offset(kvm, p4d, addr);
+}
+
+static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ phys_addr_t addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = stage2_get_pud(kvm, cache, addr);
+ if (!pud || stage2_pud_huge(kvm, *pud))
+ return NULL;
+
+ if (stage2_pud_none(kvm, *pud)) {
+ if (!cache)
+ return NULL;
+ pmd = mmu_memory_cache_alloc(cache);
+ stage2_pud_populate(kvm, pud, pmd);
+ get_page(virt_to_page(pud));
+ }
+
+ return stage2_pmd_offset(kvm, pud, addr);
+}
+
+static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
+ *cache, phys_addr_t addr, const pmd_t *new_pmd)
+{
+ pmd_t *pmd, old_pmd;
+
+retry:
+ pmd = stage2_get_pmd(kvm, cache, addr);
+ VM_BUG_ON(!pmd);
+
+ old_pmd = *pmd;
+ /*
+ * Multiple vcpus faulting on the same PMD entry can
+ * lead to them sequentially updating the PMD with the
+ * same value. Following the break-before-make
+ * (pmd_clear() followed by tlb_flush()) process can
+ * hinder forward progress due to refaults generated
+ * on missing translations.
+ *
+ * Skip updating the page table if the entry is
+ * unchanged.
+ */
+ if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+ return 0;
+
+ if (pmd_present(old_pmd)) {
+ /*
+ * If we already have a PTE level mapping for this block,
+ * we must unmap it to avoid inconsistent TLB state and
+ * leaking the table page. We could end up in this situation
+ * if the memory slot was marked for dirty logging and was
+ * reverted, leaving PTE level mappings for the pages accessed
+ * during the period. So, unmap the PTE level mapping for this
+ * block and retry, as we could have released the upper level
+ * table in the process.
+ *
+ * Normal THP split/merge follows mmu_notifier callbacks and does
+ * get handled accordingly.
+ */
+ if (!pmd_thp_or_huge(old_pmd)) {
+ unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+ goto retry;
+ }
+ /*
+ * Mapping in huge pages should only happen through a
+ * fault. If a page is merged into a transparent huge
+ * page, the individual subpages of that huge page
+ * should be unmapped through MMU notifiers before we
+ * get here.
+ *
+ * Merging of CompoundPages is not supported; they
+ * should instead be split first, unmapped, merged,
+ * and mapped back in on-demand.
+ */
+ WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
+ pmd_clear(pmd);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ get_page(virt_to_page(pmd));
+ }
+
+ kvm_set_pmd(pmd, *new_pmd);
+ return 0;
+}
+
+static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ phys_addr_t addr, const pud_t *new_pudp)
+{
+ pud_t *pudp, old_pud;
+
+retry:
+ pudp = stage2_get_pud(kvm, cache, addr);
+ VM_BUG_ON(!pudp);
+
+ old_pud = *pudp;
+
+ /*
+ * A large number of vcpus faulting on the same stage 2 entry,
+ * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+ * Skip updating the page tables if there is no change.
+ */
+ if (pud_val(old_pud) == pud_val(*new_pudp))
+ return 0;
+
+ if (stage2_pud_present(kvm, old_pud)) {
+ /*
+ * If we already have table level mapping for this block, unmap
+ * the range for this block and retry.
+ */
+ if (!stage2_pud_huge(kvm, old_pud)) {
+ unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+ goto retry;
+ }
+
+ WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
+ stage2_pud_clear(kvm, pudp);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ get_page(virt_to_page(pudp));
+ }
+
+ kvm_set_pud(pudp, *new_pudp);
+ return 0;
+}
+
+/*
+ * stage2_get_leaf_entry - walk the stage2 VM page tables and return
+ * true if a valid and present leaf-entry is found. A pointer to the
+ * leaf-entry is returned in the appropriate level variable - pudpp,
+ * pmdpp, ptepp.
+ */
+static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
+ pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
+{
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ *pudpp = NULL;
+ *pmdpp = NULL;
+ *ptepp = NULL;
+
+ pudp = stage2_get_pud(kvm, NULL, addr);
+ if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
+ return false;
+
+ if (stage2_pud_huge(kvm, *pudp)) {
+ *pudpp = pudp;
+ return true;
+ }
+
+ pmdp = stage2_pmd_offset(kvm, pudp, addr);
+ if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
+ return false;
+
+ if (pmd_thp_or_huge(*pmdp)) {
+ *pmdpp = pmdp;
+ return true;
+ }
+
+ ptep = pte_offset_kernel(pmdp, addr);
+ if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
+ return false;
+
+ *ptepp = ptep;
+ return true;
+}
+
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+{
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ bool found;
+
+ found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
+ if (!found)
+ return false;
+
+ if (pudp)
+ return kvm_s2pud_exec(pudp);
+ else if (pmdp)
+ return kvm_s2pmd_exec(pmdp);
+ else
+ return kvm_s2pte_exec(ptep);
+}
+
+static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ phys_addr_t addr, const pte_t *new_pte,
+ unsigned long flags)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, old_pte;
+ bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
+ bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
+
+ VM_BUG_ON(logging_active && !cache);
+
+ /* Create stage-2 page table mapping - Levels 0 and 1 */
+ pud = stage2_get_pud(kvm, cache, addr);
+ if (!pud) {
+ /*
+ * Ignore calls from kvm_set_spte_hva for unallocated
+ * address ranges.
+ */
+ return 0;
+ }
+
+ /*
+ * While dirty page logging is active, dissolve a huge PUD, then
+ * continue on to allocate a page table.
+ */
+ if (logging_active)
+ stage2_dissolve_pud(kvm, addr, pud);
+
+ if (stage2_pud_none(kvm, *pud)) {
+ if (!cache)
+ return 0; /* ignore calls from kvm_set_spte_hva */
+ pmd = mmu_memory_cache_alloc(cache);
+ stage2_pud_populate(kvm, pud, pmd);
+ get_page(virt_to_page(pud));
+ }
+
+ pmd = stage2_pmd_offset(kvm, pud, addr);
+ if (!pmd) {
+ /*
+ * Ignore calls from kvm_set_spte_hva for unallocated
+ * address ranges.
+ */
+ return 0;
+ }
+
+ /*
+ * While dirty page logging is active, dissolve a huge PMD, then
+ * continue on to allocate a page table.
+ */
+ if (logging_active)
+ stage2_dissolve_pmd(kvm, addr, pmd);
+
+ /* Create stage-2 page mappings - Level 2 */
+ if (pmd_none(*pmd)) {
+ if (!cache)
+ return 0; /* ignore calls from kvm_set_spte_hva */
+ pte = mmu_memory_cache_alloc(cache);
+ kvm_pmd_populate(pmd, pte);
+ get_page(virt_to_page(pmd));
+ }
+
+ pte = pte_offset_kernel(pmd, addr);
+
+ if (iomap && pte_present(*pte))
+ return -EFAULT;
+
+ /* Create 2nd stage page table mapping - Level 3 */
+ old_pte = *pte;
+ if (pte_present(old_pte)) {
+ /* Skip page table update if there is no change */
+ if (pte_val(old_pte) == pte_val(*new_pte))
+ return 0;
+
+ kvm_set_pte(pte, __pte(0));
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ get_page(virt_to_page(pte));
+ }
+
+ kvm_set_pte(pte, *new_pte);
+ return 0;
+}
+
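+/*
+ * Aging of stage-2 entries: use the architecture's atomic
+ * __ptep_test_and_clear_young() when it is available, otherwise fall
+ * back to a plain read-modify-write of the access flag. The PMD and PUD
+ * variants reuse the PTE helper since only the access flag is touched.
+ */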
+#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+ if (pte_young(*pte)) {
+ *pte = pte_mkold(*pte);
+ return 1;
+ }
+ return 0;
+}
+#else
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+ return __ptep_test_and_clear_young(pte);
+}
+#endif
+
+static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+{
+ return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+}
+
+static int stage2_pudp_test_and_clear_young(pud_t *pud)
+{
+ return stage2_ptep_test_and_clear_young((pte_t *)pud);
+}
+
+/**
+ * kvm_phys_addr_ioremap - map a device range to guest IPA
+ *
+ * @kvm: The KVM pointer
+ * @guest_ipa: The IPA at which to insert the mapping
+ * @pa: The physical address of the device
+ * @size: The size of the mapping
+ * @writable: Whether or not to create a writable mapping
+ */
+int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
+ phys_addr_t pa, unsigned long size, bool writable)
+{
+ phys_addr_t addr, end;
+ int ret = 0;
+ unsigned long pfn;
+ struct kvm_mmu_memory_cache cache = { 0, };
+
+ end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
+ pfn = __phys_to_pfn(pa);
+
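+ /*
+ * Device mappings are installed one page at a time: top up the
+ * local memory cache before each stage2_set_pte() call so that no
+ * allocation has to happen under the mmu_lock.
+ */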
+ for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
+ pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
+
+ if (writable)
+ pte = kvm_s2pte_mkwrite(pte);
+
+ ret = mmu_topup_memory_cache(&cache,
+ kvm_mmu_cache_min_pages(kvm),
+ KVM_NR_MEM_OBJS);
+ if (ret)
+ goto out;
+ spin_lock(&kvm->mmu_lock);
+ ret = stage2_set_pte(kvm, &cache, addr, &pte,
+ KVM_S2PTE_FLAG_IS_IOMAP);
+ spin_unlock(&kvm->mmu_lock);
+ if (ret)
+ goto out;
+
+ pfn++;
+ }
+
+out:
+ mmu_free_memory_cache(&cache);
+ return ret;
+}
+
+/**
+ * stage2_wp_ptes - write protect PMD range
+ * @pmd: pointer to pmd entry
+ * @addr: range start address
+ * @end: range end address
+ */
+static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+ pte_t *pte;
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ if (!pte_none(*pte)) {
+ if (!kvm_s2pte_readonly(pte))
+ kvm_set_s2pte_readonly(pte);
+ }
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+/**
+ * stage2_wp_pmds - write protect PUD range
+ * @kvm: kvm instance for the VM
+ * @pud: pointer to pud entry
+ * @addr: range start address
+ * @end: range end address
+ */
+static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
+ phys_addr_t addr, phys_addr_t end)
+{
+ pmd_t *pmd;
+ phys_addr_t next;
+
+ pmd = stage2_pmd_offset(kvm, pud, addr);
+
+ do {
+ next = stage2_pmd_addr_end(kvm, addr, end);
+ if (!pmd_none(*pmd)) {
+ if (pmd_thp_or_huge(*pmd)) {
+ if (!kvm_s2pmd_readonly(pmd))
+ kvm_set_s2pmd_readonly(pmd);
+ } else {
+ stage2_wp_ptes(pmd, addr, next);
+ }
+ }
+ } while (pmd++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_puds - write protect P4D range
+ * @kvm: kvm instance for the VM
+ * @p4d: pointer to p4d entry
+ * @addr: range start address
+ * @end: range end address
+ */
+static void stage2_wp_puds(struct kvm *kvm, p4d_t *p4d,
+ phys_addr_t addr, phys_addr_t end)
+{
+ pud_t *pud;
+ phys_addr_t next;
+
+ pud = stage2_pud_offset(kvm, p4d, addr);
+ do {
+ next = stage2_pud_addr_end(kvm, addr, end);
+ if (!stage2_pud_none(kvm, *pud)) {
+ if (stage2_pud_huge(kvm, *pud)) {
+ if (!kvm_s2pud_readonly(pud))
+ kvm_set_s2pud_readonly(pud);
+ } else {
+ stage2_wp_pmds(kvm, pud, addr, next);
+ }
+ }
+ } while (pud++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_p4ds - write protect PGD range
+ * @kvm: kvm instance for the VM
+ * @pgd: pointer to pgd entry
+ * @addr: range start address
+ * @end: range end address
+ */
+static void stage2_wp_p4ds(struct kvm *kvm, pgd_t *pgd,
+ phys_addr_t addr, phys_addr_t end)
+{
+ p4d_t *p4d;
+ phys_addr_t next;
+
+ p4d = stage2_p4d_offset(kvm, pgd, addr);
+ do {
+ next = stage2_p4d_addr_end(kvm, addr, end);
+ if (!stage2_p4d_none(kvm, *p4d))
+ stage2_wp_puds(kvm, p4d, addr, next);
+ } while (p4d++, addr = next, addr != end);
+}
+
+/**
+ * stage2_wp_range() - write protect stage2 memory region range
+ * @kvm: The KVM pointer
+ * @addr: Start address of range
+ * @end: End address of range
+ */
+static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+ pgd_t *pgd;
+ phys_addr_t next;
+
+ pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr);
+ do {
+ /*
+ * Release kvm_mmu_lock periodically if the memory region is
+ * large. Otherwise, we may see kernel panics with
+ * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
+ * CONFIG_LOCKDEP. Additionally, holding the lock for too long
+ * will also starve other vCPUs. We also have to make sure
+ * that the page tables are not freed while we release
+ * the lock.
+ */
+ cond_resched_lock(&kvm->mmu_lock);
+ if (!READ_ONCE(kvm->arch.pgd))
+ break;
+ next = stage2_pgd_addr_end(kvm, addr, end);
+ if (stage2_pgd_present(kvm, *pgd))
+ stage2_wp_p4ds(kvm, pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+}
+
+/**
+ * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
+ * @kvm: The KVM pointer
+ * @slot: The memory slot to write protect
+ *
+ * Called to start logging dirty pages after the KVM_MEM_LOG_DIRTY_PAGES
+ * flag is set on a memory region. After this function returns, all present
+ * PUD, PMD and PTE entries in the memory region are write protected.
+ * Afterwards, the dirty page log can be read.
+ *
+ * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
+ * serializing operations for VM memory regions.
+ */
+void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
+{
+ struct kvm_memslots *slots = kvm_memslots(kvm);
+ struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
+ phys_addr_t start, end;
+
+ if (WARN_ON_ONCE(!memslot))
+ return;
+
+ start = memslot->base_gfn << PAGE_SHIFT;
+ end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
+
+ spin_lock(&kvm->mmu_lock);
+ stage2_wp_range(kvm, start, end);
+ spin_unlock(&kvm->mmu_lock);
+ kvm_flush_remote_tlbs(kvm);
+}
+
+/**
+ * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
+ * slot to be write protected
+ *
+ * Walks the bits set in mask and write protects the associated PTEs. The
+ * caller must hold kvm_mmu_lock.
+ */
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
+ phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
+ phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
+
+ stage2_wp_range(kvm, start, end);
+}
+
+/*
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for the
+ * pages selected by the mask.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect the selected
+ * pages, enabling dirty logging for them.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
+static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+ __clean_dcache_guest_page(pfn, size);
+}
+
+static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size)
+{
+ __invalidate_icache_guest_page(pfn, size);
+}
+
+static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
+{
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
+}
+
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+ unsigned long hva,
+ unsigned long map_size)
+{
+ gpa_t gpa_start;
+ hva_t uaddr_start, uaddr_end;
+ size_t size;
+
+ /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
+ if (map_size == PAGE_SIZE)
+ return true;
+
+ size = memslot->npages * PAGE_SIZE;
+
+ gpa_start = memslot->base_gfn << PAGE_SHIFT;
+
+ uaddr_start = memslot->userspace_addr;
+ uaddr_end = uaddr_start + size;
+
+ /*
+ * Pages belonging to memslots that don't have the same alignment
+ * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+ * PMD/PUD entries, because we'll end up mapping the wrong pages.
+ *
+ * Consider a layout like the following:
+ *
+ * memslot->userspace_addr:
+ * +-----+--------------------+--------------------+---+
+ * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
+ * +-----+--------------------+--------------------+---+
+ *
+ * memslot->base_gfn << PAGE_SHIFT:
+ * +---+--------------------+--------------------+-----+
+ * |abc|def Stage-2 block | Stage-2 block |tvxyz|
+ * +---+--------------------+--------------------+-----+
+ *
+ * If we create those stage-2 blocks, we'll end up with this incorrect
+ * mapping:
+ * d -> f
+ * e -> g
+ * f -> h
+ */
+ if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
+ return false;
+
+ /*
+ * Next, let's make sure we're not trying to map anything not covered
+ * by the memslot. This means we have to prohibit block size mappings
+ * for the beginning and end of a non-block aligned and non-block sized
+ * memory slot (illustrated by the head and tail parts of the
+ * userspace view above containing pages 'abcde' and 'xyz',
+ * respectively).
+ *
+ * Note that it doesn't matter if we do the check using the
+ * userspace_addr or the base_gfn, as both are equally aligned (per
+ * the check above) and equally sized.
+ */
+ return (hva & ~(map_size - 1)) >= uaddr_start &&
+ (hva & ~(map_size - 1)) + map_size <= uaddr_end;
+}
+
+/*
+ * Check if the given hva is backed by a transparent huge page (THP) and
+ * whether it can be mapped using block mapping in stage2. If so, adjust
+ * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
+ * supported. This will need to be updated to support other THP sizes.
+ *
+ * Returns the size of the mapping.
+ */
+static unsigned long
+transparent_hugepage_adjust(struct kvm_memory_slot *memslot,
+ unsigned long hva, kvm_pfn_t *pfnp,
+ phys_addr_t *ipap)
+{
+ kvm_pfn_t pfn = *pfnp;
+
+ /*
+ * Make sure the adjustment is done only for THP pages. Also make
+ * sure that the HVA and IPA are sufficiently aligned and that the
+ * block map is contained within the memslot.
+ */
+ if (kvm_is_transparent_hugepage(pfn) &&
+ fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+ /*
+ * The address we faulted on is backed by a transparent huge
+ * page. However, because we map the compound huge page and
+ * not the individual tail page, we need to transfer the
+ * refcount to the head page. We have to be careful that the
+ * THP doesn't start to split while we are adjusting the
+ * refcounts.
+ *
+ * We are sure this doesn't happen, because mmu_notifier_retry
+ * was successful and we are holding the mmu_lock, so if this
+ * THP is trying to split, it will be blocked in the mmu
+ * notifier before touching any of the pages, specifically
+ * before being able to call __split_huge_page_refcount().
+ *
+ * We can therefore safely transfer the refcount from PG_tail
+ * to PG_head and switch the pfn from a tail page to the head
+ * page accordingly.
+ */
+ *ipap &= PMD_MASK;
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~(PTRS_PER_PMD - 1);
+ kvm_get_pfn(pfn);
+ *pfnp = pfn;
+
+ return PMD_SIZE;
+ }
+
+ /* Use page mapping if we cannot use block mapping. */
+ return PAGE_SIZE;
+}
+
+static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+ struct kvm_memory_slot *memslot, unsigned long hva,
+ unsigned long fault_status)
+{
+ int ret;
+ bool write_fault, writable, force_pte = false;
+ bool exec_fault, needs_exec;
+ unsigned long mmu_seq;
+ gfn_t gfn = fault_ipa >> PAGE_SHIFT;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+ struct vm_area_struct *vma;
+ short vma_shift;
+ kvm_pfn_t pfn;
+ pgprot_t mem_type = PAGE_S2;
+ bool logging_active = memslot_is_logging(memslot);
+ unsigned long vma_pagesize, flags = 0;
+
+ write_fault = kvm_is_write_fault(vcpu);
+ exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+ VM_BUG_ON(write_fault && exec_fault);
+
+ if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
+ kvm_err("Unexpected L2 read permission error\n");
+ return -EFAULT;
+ }
+
+ /* Let's check if we will get back a huge page backed by hugetlbfs */
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma_intersection(current->mm, hva, hva + 1);
+ if (unlikely(!vma)) {
+ kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+ up_read(&current->mm->mmap_sem);
+ return -EFAULT;
+ }
+
+ if (is_vm_hugetlb_page(vma))
+ vma_shift = huge_page_shift(hstate_vma(vma));
+ else
+ vma_shift = PAGE_SHIFT;
+
+ vma_pagesize = 1ULL << vma_shift;
+ if (logging_active ||
+ (vma->vm_flags & VM_PFNMAP) ||
+ !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+ force_pte = true;
+ vma_pagesize = PAGE_SIZE;
+ }
+
+ /*
+ * Stage-2 has a minimum of two levels of page tables (for arm64 see
+ * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
+ * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
+ * As for PUD huge maps, we must make sure that we have at least
+ * 3 levels, i.e., the PMD is not folded.
+ */
+ if (vma_pagesize == PMD_SIZE ||
+ (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
+ gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+ up_read(&current->mm->mmap_sem);
+
+ /* We need minimum second+third level pages */
+ ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm),
+ KVM_NR_MEM_OBJS);
+ if (ret)
+ return ret;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ /*
+ * Ensure the read of mmu_notifier_seq happens before we call
+ * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
+ * the page we just got a reference to being unmapped before we have a
+ * chance to grab the mmu_lock, which ensures that if the page gets
+ * unmapped afterwards, the call to kvm_unmap_hva will take it away
+ * from us again properly. This smp_rmb() interacts with the smp_wmb()
+ * in kvm_mmu_notifier_invalidate_<page|range_end>.
+ */
+ smp_rmb();
+
+ pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
+ if (pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(hva, vma_shift);
+ return 0;
+ }
+ if (is_error_noslot_pfn(pfn))
+ return -EFAULT;
+
+ if (kvm_is_device_pfn(pfn)) {
+ mem_type = PAGE_S2_DEVICE;
+ flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+ } else if (logging_active) {
+ /*
+ * Pages in a memslot with logging enabled should not be
+ * mapped with huge pages (this introduces churn and performance
+ * degradation), so force a pte mapping.
+ */
+ flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
+
+ /*
+ * Only actually map the page as writable if this was a write
+ * fault.
+ */
+ if (!write_fault)
+ writable = false;
+ }
+
+ if (exec_fault && is_iomap(flags))
+ return -ENOEXEC;
+
+ spin_lock(&kvm->mmu_lock);
+ if (mmu_notifier_retry(kvm, mmu_seq))
+ goto out_unlock;
+
+ /*
+ * If we are not forced to use page mapping, check if we are
+ * backed by a THP and thus use block mapping if possible.
+ */
+ if (vma_pagesize == PAGE_SIZE && !force_pte)
+ vma_pagesize = transparent_hugepage_adjust(memslot, hva,
+ &pfn, &fault_ipa);
+ if (writable)
+ kvm_set_pfn_dirty(pfn);
+
+ if (fault_status != FSC_PERM && !is_iomap(flags))
+ clean_dcache_guest_page(pfn, vma_pagesize);
+
+ if (exec_fault)
+ invalidate_icache_guest_page(pfn, vma_pagesize);
+
+ /*
+ * If we took an execution fault we have made the
+ * icache/dcache coherent above and should now let the s2
+ * mapping be executable.
+ *
+ * Write faults (!exec_fault && FSC_PERM) are orthogonal to
+ * execute permissions, and we preserve whatever we have.
+ */
+ needs_exec = exec_fault ||
+ (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
+
+ if (vma_pagesize == PUD_SIZE) {
+ pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
+
+ new_pud = kvm_pud_mkhuge(new_pud);
+ if (writable)
+ new_pud = kvm_s2pud_mkwrite(new_pud);
+
+ if (needs_exec)
+ new_pud = kvm_s2pud_mkexec(new_pud);
+
+ ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
+ } else if (vma_pagesize == PMD_SIZE) {
+ pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
+
+ new_pmd = kvm_pmd_mkhuge(new_pmd);
+
+ if (writable)
+ new_pmd = kvm_s2pmd_mkwrite(new_pmd);
+
+ if (needs_exec)
+ new_pmd = kvm_s2pmd_mkexec(new_pmd);
+
+ ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
+ } else {
+ pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
+
+ if (writable) {
+ new_pte = kvm_s2pte_mkwrite(new_pte);
+ mark_page_dirty(kvm, gfn);
+ }
+
+ if (needs_exec)
+ new_pte = kvm_s2pte_mkexec(new_pte);
+
+ ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
+ }
+
+out_unlock:
+ spin_unlock(&kvm->mmu_lock);
+ kvm_set_pfn_accessed(pfn);
+ kvm_release_pfn_clean(pfn);
+ return ret;
+}
+
+/*
+ * Resolve the access fault by making the page young again.
+ * Note that because the faulting entry is guaranteed not to be
+ * cached in the TLB, we don't need to invalidate anything.
+ * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+ * so there is no need for atomic (pte|pmd)_mkyoung operations.
+ */
+static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ kvm_pfn_t pfn;
+ bool pfn_valid = false;
+
+ trace_kvm_access_fault(fault_ipa);
+
+ spin_lock(&vcpu->kvm->mmu_lock);
+
+ if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
+ goto out;
+
+ if (pud) { /* HugeTLB */
+ *pud = kvm_s2pud_mkyoung(*pud);
+ pfn = kvm_pud_pfn(*pud);
+ pfn_valid = true;
+ } else if (pmd) { /* THP, HugeTLB */
+ *pmd = pmd_mkyoung(*pmd);
+ pfn = pmd_pfn(*pmd);
+ pfn_valid = true;
+ } else {
+ *pte = pte_mkyoung(*pte); /* Just a page... */
+ pfn = pte_pfn(*pte);
+ pfn_valid = true;
+ }
+
+out:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ if (pfn_valid)
+ kvm_set_pfn_accessed(pfn);
+}
+
+/**
+ * kvm_handle_guest_abort - handles all 2nd stage aborts
+ * @vcpu: the VCPU pointer
+ * @run: the kvm_run structure
+ *
+ * Any abort that gets to the host is almost guaranteed to be caused by a
+ * missing second stage translation table entry, which can mean that either the
+ * guest simply needs more memory and we must allocate an appropriate page or it
+ * can mean that the guest tried to access I/O memory, which is emulated by user
+ * space. The distinction is based on the IPA causing the fault and whether this
+ * memory region has been registered as standard RAM by user space.
+ */
+int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+ unsigned long fault_status;
+ phys_addr_t fault_ipa;
+ struct kvm_memory_slot *memslot;
+ unsigned long hva;
+ bool is_iabt, write_fault, writable;
+ gfn_t gfn;
+ int ret, idx;
+
+ fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
+
+ fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
+
+ /* Synchronous External Abort? */
+ if (kvm_vcpu_dabt_isextabt(vcpu)) {
+ /*
+ * For RAS the host kernel may handle this abort.
+ * There is no need to pass the error into the guest.
+ */
+ if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
+ return 1;
+
+ if (unlikely(!is_iabt)) {
+ kvm_inject_vabt(vcpu);
+ return 1;
+ }
+ }
+
+ trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
+ kvm_vcpu_get_hfar(vcpu), fault_ipa);
+
+ /* Check that the stage-2 fault is a translation, permission or access fault */
+ if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
+ fault_status != FSC_ACCESS) {
+ kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
+ kvm_vcpu_trap_get_class(vcpu),
+ (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
+ (unsigned long)kvm_vcpu_get_hsr(vcpu));
+ return -EFAULT;
+ }
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ gfn = fault_ipa >> PAGE_SHIFT;
+ memslot = gfn_to_memslot(vcpu->kvm, gfn);
+ hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
+ write_fault = kvm_is_write_fault(vcpu);
+ if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
+ if (is_iabt) {
+ /* Prefetch Abort on I/O address */
+ ret = -ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * Check for a cache maintenance operation. Since we
+ * ended-up here, we know it is outside of any memory
+ * slot. But we can't find out if that is for a device,
+ * or if the guest is just being stupid. The only thing
+ * we know for sure is that this range cannot be cached.
+ *
+ * So let's assume that the guest is just being
+ * cautious, and skip the instruction.
+ */
+ if (kvm_vcpu_dabt_is_cm(vcpu)) {
+ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+ ret = 1;
+ goto out_unlock;
+ }
+
+ /*
+ * The IPA is reported as [MAX:12], so we need to
+ * complement it with the bottom 12 bits from the
+ * faulting VA. This is always 12 bits, irrespective
+ * of the page size.
+ */
+ fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
+ ret = io_mem_abort(vcpu, run, fault_ipa);
+ goto out_unlock;
+ }
+
+ /* Userspace should not be able to register out-of-bounds IPAs */
+ VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm));
+
+ if (fault_status == FSC_ACCESS) {
+ handle_access_fault(vcpu, fault_ipa);
+ ret = 1;
+ goto out_unlock;
+ }
+
+ ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
+ if (ret == 0)
+ ret = 1;
+out:
+ if (ret == -ENOEXEC) {
+ kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ ret = 1;
+ }
+out_unlock:
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ return ret;
+}
+
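+/*
+ * Apply @handler to the guest physical ranges backing the HVA range
+ * [start, end). The range may intersect several memslots (or none), so
+ * clamp it against each memslot and translate the start to a GPA before
+ * invoking the handler. The handlers' return values are OR'ed together.
+ */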
+static int handle_hva_to_gpa(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ int (*handler)(struct kvm *kvm,
+ gpa_t gpa, u64 size,
+ void *data),
+ void *data)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int ret = 0;
+
+ slots = kvm_memslots(kvm);
+
+ /* we only care about the pages that the guest sees */
+ kvm_for_each_memslot(memslot, slots) {
+ unsigned long hva_start, hva_end;
+ gfn_t gpa;
+
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+
+ gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT;
+ ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data);
+ }
+
+ return ret;
+}
+
+static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+ unmap_stage2_range(kvm, gpa, size);
+ return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
+{
+ if (!kvm->arch.pgd)
+ return 0;
+
+ trace_kvm_unmap_hva_range(start, end);
+ handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
+ return 0;
+}
+
+static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+ pte_t *pte = (pte_t *)data;
+
+ WARN_ON(size != PAGE_SIZE);
+ /*
+ * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
+ * flag clear because MMU notifiers will have unmapped a huge PMD before
+ * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+ * therefore stage2_set_pte() never needs to clear out a huge PMD
+ * through this calling path.
+ */
+ stage2_set_pte(kvm, NULL, gpa, pte, 0);
+ return 0;
+}
+
+
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ unsigned long end = hva + PAGE_SIZE;
+ kvm_pfn_t pfn = pte_pfn(pte);
+ pte_t stage2_pte;
+
+ if (!kvm->arch.pgd)
+ return 0;
+
+ trace_kvm_set_spte_hva(hva);
+
+ /*
+ * We've moved a page around, probably through CoW, so let's treat it
+ * just like a translation fault and clean the cache to the PoC.
+ */
+ clean_dcache_guest_page(pfn, PAGE_SIZE);
+ stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
+ handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
+
+ return 0;
+}
+
+static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+ if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
+ return 0;
+
+ if (pud)
+ return stage2_pudp_test_and_clear_young(pud);
+ else if (pmd)
+ return stage2_pmdp_test_and_clear_young(pmd);
+ else
+ return stage2_ptep_test_and_clear_young(pte);
+}
+
+static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+ if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
+ return 0;
+
+ if (pud)
+ return kvm_s2pud_young(*pud);
+ else if (pmd)
+ return pmd_young(*pmd);
+ else
+ return pte_young(*pte);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ if (!kvm->arch.pgd)
+ return 0;
+ trace_kvm_age_hva(start, end);
+ return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ if (!kvm->arch.pgd)
+ return 0;
+ trace_kvm_test_age_hva(hva);
+ return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
+ kvm_test_age_hva_handler, NULL);
+}
+
+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+}
+
+phys_addr_t kvm_mmu_get_httbr(void)
+{
+ if (__kvm_cpu_uses_extended_idmap())
+ return virt_to_phys(merged_hyp_pgd);
+ else
+ return virt_to_phys(hyp_pgd);
+}
+
+phys_addr_t kvm_get_idmap_vector(void)
+{
+ return hyp_idmap_vector;
+}
+
+static int kvm_map_idmap_text(pgd_t *pgd)
+{
+ int err;
+
+ /* Create the idmap in the boot page tables */
+ err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
+ hyp_idmap_start, hyp_idmap_end,
+ __phys_to_pfn(hyp_idmap_start),
+ PAGE_HYP_EXEC);
+ if (err)
+ kvm_err("Failed to idmap %lx-%lx\n",
+ hyp_idmap_start, hyp_idmap_end);
+
+ return err;
+}
+
+int kvm_mmu_init(void)
+{
+ int err;
+
+ hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
+ hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
+ hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
+ hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
+ hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
+
+ /*
+ * We rely on the linker script to ensure at build time that the HYP
+ * init code does not cross a page boundary.
+ */
+ BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
+
+ kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
+ kvm_debug("HYP VA range: %lx:%lx\n",
+ kern_hyp_va(PAGE_OFFSET),
+ kern_hyp_va((unsigned long)high_memory - 1));
+
+ if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
+ hyp_idmap_start < kern_hyp_va((unsigned long)high_memory - 1) &&
+ hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
+ /*
+ * The idmap page intersects with the HYP VA space;
+ * it is not safe to continue further.
+ */
+ kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
+ err = -EINVAL;
+ goto out;
+ }
+
+ hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+ if (!hyp_pgd) {
+ kvm_err("Hyp mode PGD not allocated\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (__kvm_cpu_uses_extended_idmap()) {
+ boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ hyp_pgd_order);
+ if (!boot_hyp_pgd) {
+ kvm_err("Hyp boot PGD not allocated\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = kvm_map_idmap_text(boot_hyp_pgd);
+ if (err)
+ goto out;
+
+ merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (!merged_hyp_pgd) {
+ kvm_err("Failed to allocate extra HYP pgd\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
+ hyp_idmap_start);
+ } else {
+ err = kvm_map_idmap_text(hyp_pgd);
+ if (err)
+ goto out;
+ }
+
+ io_map_base = hyp_idmap_start;
+ return 0;
+out:
+ free_hyp_pgds();
+ return err;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ /*
+ * At this point the memslot has been committed and there is an
+ * allocated dirty_bitmap[]; dirty pages will be tracked while the
+ * memory slot is write protected.
+ */
+ if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ /*
+ * If initial-all-set mode is enabled, we don't need to write
+ * protect any pages because they're all reported as dirty.
+ * Huge pages and normal pages will be write protected gradually.
+ */
+ if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+ kvm_mmu_wp_memory_region(kvm, mem->slot);
+ }
+ }
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ const struct kvm_userspace_memory_region *mem,
+ enum kvm_mr_change change)
+{
+ hva_t hva = mem->userspace_addr;
+ hva_t reg_end = hva + mem->memory_size;
+ bool writable = !(mem->flags & KVM_MEM_READONLY);
+ int ret = 0;
+
+ if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
+ change != KVM_MR_FLAGS_ONLY)
+ return 0;
+
+ /*
+ * Prevent userspace from creating a memory region outside of the IPA
+ * space addressable by the KVM guest.
+ */
+ if (memslot->base_gfn + memslot->npages >=
+ (kvm_phys_size(kvm) >> PAGE_SHIFT))
+ return -EFAULT;
+
+ down_read(&current->mm->mmap_sem);
+ /*
+ * A memory region could potentially cover multiple VMAs, and any holes
+ * between them, so iterate over all of them to find out if we can map
+ * any of them right now.
+ *
+ * +--------------------------------------------+
+ * +---------------+----------------+ +----------------+
+ * | : VMA 1 | VMA 2 | | VMA 3 : |
+ * +---------------+----------------+ +----------------+
+ * | memory region |
+ * +--------------------------------------------+
+ */
+ do {
+ struct vm_area_struct *vma = find_vma(current->mm, hva);
+ hva_t vm_start, vm_end;
+
+ if (!vma || vma->vm_start >= reg_end)
+ break;
+
+ /*
+ * Take the intersection of this VMA with the memory region
+ */
+ vm_start = max(hva, vma->vm_start);
+ vm_end = min(reg_end, vma->vm_end);
+
+ if (vma->vm_flags & VM_PFNMAP) {
+ gpa_t gpa = mem->guest_phys_addr +
+ (vm_start - mem->userspace_addr);
+ phys_addr_t pa;
+
+ pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
+ pa += vm_start - vma->vm_start;
+
+ /* IO region dirty page logging not allowed */
+ if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
+ vm_end - vm_start,
+ writable);
+ if (ret)
+ break;
+ }
+ hva = vm_end;
+ } while (hva < reg_end);
+
+ if (change == KVM_MR_FLAGS_ONLY)
+ goto out;
+
+ spin_lock(&kvm->mmu_lock);
+ if (ret)
+ unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
+ else
+ stage2_flush_memslot(kvm, memslot);
+ spin_unlock(&kvm->mmu_lock);
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+}
+
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
+{
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ kvm_free_stage2_pgd(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
+ phys_addr_t size = slot->npages << PAGE_SHIFT;
+
+ spin_lock(&kvm->mmu_lock);
+ unmap_stage2_range(kvm, gpa, size);
+ spin_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
+ *
+ * Main problems:
+ * - S/W ops are local to a CPU (not broadcast)
+ * - We have line migration behind our back (speculation)
+ * - System caches don't support S/W at all (damn!)
+ *
+ * In the face of the above, the best we can do is to try and convert
+ * S/W ops to VA ops. Because the guest is not allowed to infer the
+ * S/W to PA mapping, it can only use S/W to nuke the whole cache,
+ * which is a rather good thing for us.
+ *
+ * Also, it is only used when turning caches on/off ("The expected
+ * usage of the cache maintenance instructions that operate by set/way
+ * is associated with the cache maintenance instructions associated
+ * with the powerdown and powerup of caches, if this is required by
+ * the implementation.").
+ *
+ * We use the following policy:
+ *
+ * - If we trap a S/W operation, we enable VM trapping to detect
+ * caches being turned on/off, and do a full clean.
+ *
+ * - We flush the caches on both caches being turned on and off.
+ *
+ * - Once the caches are enabled, we stop trapping VM ops.
+ */
+void kvm_set_way_flush(struct kvm_vcpu *vcpu)
+{
+ unsigned long hcr = *vcpu_hcr(vcpu);
+
+ /*
+ * If this is the first time we do a S/W operation
+ * (i.e. HCR_TVM not set) flush the whole memory, and set the
+ * VM trapping.
+ *
+ * Otherwise, rely on the VM trapping to wait for the MMU +
+ * Caches to be turned off. At that point, we'll be able to
+ * clean the caches again.
+ */
+ if (!(hcr & HCR_TVM)) {
+ trace_kvm_set_way_flush(*vcpu_pc(vcpu),
+ vcpu_has_cache_enabled(vcpu));
+ stage2_flush_vm(vcpu->kvm);
+ *vcpu_hcr(vcpu) = hcr | HCR_TVM;
+ }
+}
+
+void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
+{
+ bool now_enabled = vcpu_has_cache_enabled(vcpu);
+
+ /*
+ * If switching the MMU+caches on, we need to invalidate the caches.
+ * If switching them off, we need to clean the caches.
+ * Clean + invalidate always does the trick.
+ */
+ if (now_enabled != was_enabled)
+ stage2_flush_vm(vcpu->kvm);
+
+ /* Caches are now on, stop trapping VM ops (until a S/W op) */
+ if (now_enabled)
+ *vcpu_hcr(vcpu) &= ~HCR_TVM;
+
+ trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
+}
diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c
new file mode 100644
index 000000000000..d45b8b9a4415
--- /dev/null
+++ b/arch/arm64/kvm/perf.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Based on the x86 implementation.
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+static int kvm_is_in_guest(void)
+{
+ return kvm_get_running_vcpu() != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_running_vcpu();
+
+ if (vcpu)
+ return !vcpu_mode_priv(vcpu);
+
+ return 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_running_vcpu();
+
+ if (vcpu)
+ return *vcpu_pc(vcpu);
+
+ return 0;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+ .is_in_guest = kvm_is_in_guest,
+ .is_user_mode = kvm_is_user_mode,
+ .get_guest_ip = kvm_get_guest_ip,
+};
+
+int kvm_perf_init(void)
+{
+ return perf_register_guest_info_callbacks(&kvm_guest_cbs);
+}
+
+int kvm_perf_teardown(void)
+{
+ return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+}
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
new file mode 100644
index 000000000000..f0d0312c0a55
--- /dev/null
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -0,0 +1,869 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Shannon Zhao <shannon.zhao@linaro.org>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <linux/perf/arm_pmu.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_emulate.h>
+#include <kvm/arm_pmu.h>
+#include <kvm/arm_vgic.h>
+
+static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx);
+static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx);
+static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
+
+#define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
+
+/**
+ * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_is_64bit(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+ return (select_idx == ARMV8_PMU_CYCLE_IDX &&
+ __vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_LC);
+}
+
+static struct kvm_vcpu *kvm_pmc_to_vcpu(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu;
+ struct kvm_vcpu_arch *vcpu_arch;
+
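+ /*
+ * pmc is an element of the pmc[] array embedded in struct kvm_pmu,
+ * which is itself embedded in struct kvm_vcpu_arch: rewind to
+ * pmc[0], then use container_of() twice to recover the vcpu.
+ */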
+ pmc -= pmc->idx;
+ pmu = container_of(pmc, struct kvm_pmu, pmc[0]);
+ vcpu_arch = container_of(pmu, struct kvm_vcpu_arch, pmu);
+ return container_of(vcpu_arch, struct kvm_vcpu, arch);
+}
+
+/**
+ * kvm_pmu_pmc_is_chained - determine if the pmc is chained
+ * @pmc: The PMU counter pointer
+ */
+static bool kvm_pmu_pmc_is_chained(struct kvm_pmc *pmc)
+{
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+
+ return test_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+}
+
+/**
+ * kvm_pmu_idx_is_high_counter - determine if select_idx is an odd (high) counter
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_is_high_counter(u64 select_idx)
+{
+ return select_idx & 0x1;
+}
+
+/**
+ * kvm_pmu_get_canonical_pmc - obtain the canonical pmc
+ * @pmc: The PMU counter pointer
+ *
+ * When a pair of PMCs are chained together we use the low counter (canonical)
+ * to hold the underlying perf event.
+ */
+static struct kvm_pmc *kvm_pmu_get_canonical_pmc(struct kvm_pmc *pmc)
+{
+ if (kvm_pmu_pmc_is_chained(pmc) &&
+ kvm_pmu_idx_is_high_counter(pmc->idx))
+ return pmc - 1;
+
+ return pmc;
+}
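+
+/*
+ * kvm_pmu_get_alternate_pmc - obtain the other counter of a pair
+ * @pmc: The PMU counter pointer
+ *
+ * Return the neighbouring counter: the even (low) one when given an odd
+ * index, and the odd (high) one otherwise.
+ */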
+static struct kvm_pmc *kvm_pmu_get_alternate_pmc(struct kvm_pmc *pmc)
+{
+ if (kvm_pmu_idx_is_high_counter(pmc->idx))
+ return pmc - 1;
+ else
+ return pmc + 1;
+}
+
+/**
+ * kvm_pmu_idx_has_chain_evtype - determine if the event type is chain
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+ u64 eventsel, reg;
+
+ select_idx |= 0x1;
+
+ if (select_idx == ARMV8_PMU_CYCLE_IDX)
+ return false;
+
+ reg = PMEVTYPER0_EL0 + select_idx;
+ eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
+
+ return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
+}
+
+/**
+ * kvm_pmu_get_pair_counter_value - get PMU counter value
+ * @vcpu: The vcpu pointer
+ * @pmc: The PMU counter pointer
+ */
+static u64 kvm_pmu_get_pair_counter_value(struct kvm_vcpu *vcpu,
+ struct kvm_pmc *pmc)
+{
+ u64 counter, counter_high, reg, enabled, running;
+
+ if (kvm_pmu_pmc_is_chained(pmc)) {
+ pmc = kvm_pmu_get_canonical_pmc(pmc);
+ reg = PMEVCNTR0_EL0 + pmc->idx;
+
+ counter = __vcpu_sys_reg(vcpu, reg);
+ counter_high = __vcpu_sys_reg(vcpu, reg + 1);
+
+ counter = lower_32_bits(counter) | (counter_high << 32);
+ } else {
+ reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+ ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + pmc->idx;
+ counter = __vcpu_sys_reg(vcpu, reg);
+ }
+
+ /*
+ * The real counter value is equal to the value of the counter register
+ * plus the value counted by the perf event.
+ */
+ if (pmc->perf_event)
+ counter += perf_event_read_value(pmc->perf_event, &enabled,
+ &running);
+
+ return counter;
+}
+
+/**
+ * kvm_pmu_get_counter_value - get PMU counter value
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+ u64 counter;
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_pmc *pmc = &pmu->pmc[select_idx];
+
+ counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+ if (kvm_pmu_pmc_is_chained(pmc) &&
+ kvm_pmu_idx_is_high_counter(select_idx))
+ counter = upper_32_bits(counter);
+ else if (select_idx != ARMV8_PMU_CYCLE_IDX)
+ counter = lower_32_bits(counter);
+
+ return counter;
+}
+
+/**
+ * kvm_pmu_set_counter_value - set PMU counter value
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ * @val: The counter value
+ */
+void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
+{
+ u64 reg;
+
+ reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
+ ? PMCCNTR_EL0 : PMEVCNTR0_EL0 + select_idx;
+ __vcpu_sys_reg(vcpu, reg) += (s64)val - kvm_pmu_get_counter_value(vcpu, select_idx);
+
+ /* Recreate the perf event to reflect the updated sample_period */
+ kvm_pmu_create_perf_event(vcpu, select_idx);
+}
+
+/**
+ * kvm_pmu_release_perf_event - remove the perf event
+ * @pmc: The PMU counter pointer
+ */
+static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
+{
+ pmc = kvm_pmu_get_canonical_pmc(pmc);
+ if (pmc->perf_event) {
+ perf_event_disable(pmc->perf_event);
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ }
+}
+
+/**
+ * kvm_pmu_stop_counter - stop PMU counter
+ * @pmc: The PMU counter pointer
+ *
+ * If this counter has been configured to monitor some event, release it here.
+ */
+static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc)
+{
+ u64 counter, reg, val;
+
+ pmc = kvm_pmu_get_canonical_pmc(pmc);
+ if (!pmc->perf_event)
+ return;
+
+ counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+ if (pmc->idx == ARMV8_PMU_CYCLE_IDX) {
+ reg = PMCCNTR_EL0;
+ val = counter;
+ } else {
+ reg = PMEVCNTR0_EL0 + pmc->idx;
+ val = lower_32_bits(counter);
+ }
+
+ __vcpu_sys_reg(vcpu, reg) = val;
+
+ if (kvm_pmu_pmc_is_chained(pmc))
+ __vcpu_sys_reg(vcpu, reg + 1) = upper_32_bits(counter);
+
+ kvm_pmu_release_perf_event(pmc);
+}
+
+/**
+ * kvm_pmu_vcpu_init - assign pmu counter idx for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+ for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+ pmu->pmc[i].idx = i;
+}
+
+/**
+ * kvm_pmu_vcpu_reset - reset pmu state for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ int i;
+
+ for_each_set_bit(i, &mask, 32)
+ kvm_pmu_stop_counter(vcpu, &pmu->pmc[i]);
+
+ bitmap_zero(vcpu->arch.pmu.chained, ARMV8_PMU_MAX_COUNTER_PAIRS);
+}
+
+/**
+ * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
+ * @vcpu: The vcpu pointer
+ *
+ */
+void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+ for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++)
+ kvm_pmu_release_perf_event(&pmu->pmc[i]);
+}
+
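+/*
+ * PMCR_EL0.N holds the number of implemented event counters; the cycle
+ * counter (ARMV8_PMU_CYCLE_IDX) is always part of the valid mask.
+ */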
+u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu)
+{
+ u64 val = __vcpu_sys_reg(vcpu, PMCR_EL0) >> ARMV8_PMU_PMCR_N_SHIFT;
+
+ val &= ARMV8_PMU_PMCR_N_MASK;
+ if (val == 0)
+ return BIT(ARMV8_PMU_CYCLE_IDX);
+ else
+ return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX);
+}
+
+/**
+ * kvm_pmu_enable_counter_mask - enable selected PMU counters
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCNTENSET register
+ *
+ * Call perf_event_enable to start counting the perf event
+ */
+void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+{
+ int i;
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_pmc *pmc;
+
+ if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) || !val)
+ return;
+
+ for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+ if (!(val & BIT(i)))
+ continue;
+
+ pmc = &pmu->pmc[i];
+
+ /* A change in the enable state may affect the chain state */
+ kvm_pmu_update_pmc_chained(vcpu, i);
+ kvm_pmu_create_perf_event(vcpu, i);
+
+ /* At this point, pmc must be the canonical */
+ if (pmc->perf_event) {
+ perf_event_enable(pmc->perf_event);
+ if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
+ kvm_debug("fail to enable perf event\n");
+ }
+ }
+}
+
+/**
+ * kvm_pmu_disable_counter_mask - disable selected PMU counters
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCNTENCLR register
+ *
+ * Call perf_event_disable to stop counting the perf event
+ */
+void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val)
+{
+ int i;
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_pmc *pmc;
+
+ if (!val)
+ return;
+
+ for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) {
+ if (!(val & BIT(i)))
+ continue;
+
+ pmc = &pmu->pmc[i];
+
+ /* A change in the enable state may affect the chain state */
+ kvm_pmu_update_pmc_chained(vcpu, i);
+ kvm_pmu_create_perf_event(vcpu, i);
+
+ /* At this point, pmc must be the canonical */
+ if (pmc->perf_event)
+ perf_event_disable(pmc->perf_event);
+ }
+}
+
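+/*
+ * A counter contributes to the overflow status only if the PMU is
+ * globally enabled (PMCR_EL0.E), the counter is enabled (PMCNTENSET_EL0),
+ * its overflow interrupt is enabled (PMINTENSET_EL1) and its overflow
+ * flag is set (PMOVSSET_EL0).
+ */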
+static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
+{
+ u64 reg = 0;
+
+ if ((__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E)) {
+ reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
+ reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+ reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
+ reg &= kvm_pmu_valid_counter_mask(vcpu);
+ }
+
+ return reg;
+}
+
+static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ bool overflow;
+
+ if (!kvm_arm_pmu_v3_ready(vcpu))
+ return;
+
+ overflow = !!kvm_pmu_overflow_status(vcpu);
+ if (pmu->irq_level == overflow)
+ return;
+
+ pmu->irq_level = overflow;
+
+ if (likely(irqchip_in_kernel(vcpu->kvm))) {
+ int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+ pmu->irq_num, overflow, pmu);
+ WARN_ON(ret);
+ }
+}
+
+bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
+ bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU;
+
+ if (likely(irqchip_in_kernel(vcpu->kvm)))
+ return false;
+
+ return pmu->irq_level != run_level;
+}
+
+/*
+ * Reflect the PMU overflow interrupt output level into the kvm_run structure
+ */
+void kvm_pmu_update_run(struct kvm_vcpu *vcpu)
+{
+ struct kvm_sync_regs *regs = &vcpu->run->s.regs;
+
+ /* Populate the timer bitmap for user space */
+ regs->device_irq_level &= ~KVM_ARM_DEV_PMU;
+ if (vcpu->arch.pmu.irq_level)
+ regs->device_irq_level |= KVM_ARM_DEV_PMU;
+}
+
+/**
+ * kvm_pmu_flush_hwstate - flush pmu state to cpu
+ * @vcpu: The vcpu pointer
+ *
+ * Check if the PMU has overflowed while we were running in the host, and inject
+ * an interrupt if that was the case.
+ */
+void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_update_state(vcpu);
+}
+
+/**
+ * kvm_pmu_sync_hwstate - sync pmu state from cpu
+ * @vcpu: The vcpu pointer
+ *
+ * Check if the PMU has overflowed while we were running in the guest, and
+ * inject an interrupt if that was the case.
+ */
+void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_update_state(vcpu);
+}
+
+/*
+ * When the perf event overflows, set the overflow status and inform the vcpu.
+ */
+static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu);
+ struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
+ int idx = pmc->idx;
+ u64 period;
+
+ cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE);
+
+ /*
+ * Reset the sample period to the architectural limit,
+ * i.e. the point where the counter overflows.
+ */
+ period = -(local64_read(&perf_event->count));
+
+ if (!kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
+ period &= GENMASK(31, 0);
+
+ local64_set(&perf_event->hw.period_left, 0);
+ perf_event->attr.sample_period = period;
+ perf_event->hw.sample_period = period;
+
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
+
+ if (kvm_pmu_overflow_status(vcpu)) {
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD);
+}
+
+/**
+ * kvm_pmu_software_increment - do software increment
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMSWINC register
+ */
+void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ int i;
+
+ if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
+ return;
+
+ /* Weed out disabled counters */
+ val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+
+ for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
+ u64 type, reg;
+
+ if (!(val & BIT(i)))
+ continue;
+
+ /* PMSWINC only applies to ... SW_INC! */
+ type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
+ type &= ARMV8_PMU_EVTYPE_EVENT;
+ if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
+ continue;
+
+ /* increment this SW_INC event counter */
+ reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
+ reg = lower_32_bits(reg);
+ __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
+
+ if (reg) /* no overflow on the low part */
+ continue;
+
+ if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) {
+ /* increment the high counter */
+ reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1;
+ reg = lower_32_bits(reg);
+ __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg;
+ if (!reg) /* mark overflow on the high counter */
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1);
+ } else {
+ /* mark overflow on low counter */
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
+ }
+ }
+}
+
+/**
+ * kvm_pmu_handle_pmcr - handle PMCR register
+ * @vcpu: The vcpu pointer
+ * @val: the value guest writes to PMCR register
+ */
+void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
+{
+ unsigned long mask = kvm_pmu_valid_counter_mask(vcpu);
+ int i;
+
+ if (val & ARMV8_PMU_PMCR_E) {
+ kvm_pmu_enable_counter_mask(vcpu,
+ __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask);
+ } else {
+ kvm_pmu_disable_counter_mask(vcpu, mask);
+ }
+
+ if (val & ARMV8_PMU_PMCR_C)
+ kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
+
+ if (val & ARMV8_PMU_PMCR_P) {
+ for_each_set_bit(i, &mask, 32)
+ kvm_pmu_set_counter_value(vcpu, i, 0);
+ }
+}
+
+static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+ return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
+ (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(select_idx));
+}
+
+/**
+ * kvm_pmu_create_perf_event - create a perf event for a counter
+ * @vcpu: The vcpu pointer
+ * @select_idx: The number of selected counter
+ */
+static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_pmc *pmc;
+ struct perf_event *event;
+ struct perf_event_attr attr;
+ u64 eventsel, counter, reg, data;
+
+ /*
+ * For chained counters the event type and filtering attributes are
+ * obtained from the low/even counter. We also use this counter to
+ * determine if the event is enabled/disabled.
+ */
+ pmc = kvm_pmu_get_canonical_pmc(&pmu->pmc[select_idx]);
+
+ reg = (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+ ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + pmc->idx;
+ data = __vcpu_sys_reg(vcpu, reg);
+
+ kvm_pmu_stop_counter(vcpu, pmc);
+ eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
+
+ /* Software increment event doesn't need to be backed by a perf event */
+ if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
+ pmc->idx != ARMV8_PMU_CYCLE_IDX)
+ return;
+
+ memset(&attr, 0, sizeof(struct perf_event_attr));
+ attr.type = PERF_TYPE_RAW;
+ attr.size = sizeof(attr);
+ attr.pinned = 1;
+ attr.disabled = !kvm_pmu_counter_is_enabled(vcpu, pmc->idx);
+ attr.exclude_user = data & ARMV8_PMU_EXCLUDE_EL0 ? 1 : 0;
+ attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
+ attr.exclude_hv = 1; /* Don't count EL2 events */
+ attr.exclude_host = 1; /* Don't count host events */
+ attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
+ ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
+
+ counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
+
+ if (kvm_pmu_pmc_is_chained(pmc)) {
+ /*
+ * The initial sample period (overflow count) of an event. For
+ * chained counters we only support overflow interrupts on the
+ * high counter.
+ */
+ attr.sample_period = (-counter) & GENMASK(63, 0);
+ attr.config1 |= PERF_ATTR_CFG1_KVM_PMU_CHAINED;
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_pmu_perf_overflow,
+ pmc + 1);
+ } else {
+ /* The initial sample period (overflow count) of an event. */
+ if (kvm_pmu_idx_is_64bit(vcpu, pmc->idx))
+ attr.sample_period = (-counter) & GENMASK(63, 0);
+ else
+ attr.sample_period = (-counter) & GENMASK(31, 0);
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ kvm_pmu_perf_overflow, pmc);
+ }
+
+ if (IS_ERR(event)) {
+ pr_err_once("kvm: pmu event creation failed %ld\n",
+ PTR_ERR(event));
+ return;
+ }
+
+ pmc->perf_event = event;
+}
+
+/**
+ * kvm_pmu_update_pmc_chained - update chained bitmap
+ * @vcpu: The vcpu pointer
+ * @select_idx: The number of selected counter
+ *
+ * Update the chained bitmap based on the event type written in the
+ * typer register and the enable state of the odd register.
+ */
+static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_pmc *pmc = &pmu->pmc[select_idx], *canonical_pmc;
+ bool new_state, old_state;
+
+ old_state = kvm_pmu_pmc_is_chained(pmc);
+ new_state = kvm_pmu_idx_has_chain_evtype(vcpu, pmc->idx) &&
+ kvm_pmu_counter_is_enabled(vcpu, pmc->idx | 0x1);
+
+ if (old_state == new_state)
+ return;
+
+ canonical_pmc = kvm_pmu_get_canonical_pmc(pmc);
+ kvm_pmu_stop_counter(vcpu, canonical_pmc);
+ if (new_state) {
+ /*
+ * During promotion from !chained to chained we must ensure
+ * the adjacent counter is stopped and its event destroyed
+ */
+ kvm_pmu_stop_counter(vcpu, kvm_pmu_get_alternate_pmc(pmc));
+ set_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+ return;
+ }
+ clear_bit(pmc->idx >> 1, vcpu->arch.pmu.chained);
+}
+
+/**
+ * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
+ * @vcpu: The vcpu pointer
+ * @data: The data guest writes to PMXEVTYPER_EL0
+ * @select_idx: The index of the selected counter
+ *
+ * When the guest OS accesses PMXEVTYPER_EL0, it wants to set a PMC to count an
+ * event with the given hardware event number. Here we call the perf_event API
+ * to emulate this action and create a kernel perf event for it.
+ */
+void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
+ u64 select_idx)
+{
+ u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
+
+ reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
+ ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
+
+ __vcpu_sys_reg(vcpu, reg) = event_type;
+
+ kvm_pmu_update_pmc_chained(vcpu, select_idx);
+ kvm_pmu_create_perf_event(vcpu, select_idx);
+}
+
+bool kvm_arm_support_pmu_v3(void)
+{
+ /*
+ * Check if HW_PERF_EVENTS is supported by looking at the number of
+ * hardware performance counters. This ensures that a physical PMU is
+ * present and that CONFIG_PERF_EVENTS is selected.
+ */
+ return (perf_num_counters() > 0);
+}
+
+int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.pmu.created)
+ return 0;
+
+ /*
+ * A valid interrupt configuration for the PMU is either to have a
+ * properly configured interrupt number along with an in-kernel
+ * irqchip, or to have neither an in-kernel GIC nor an IRQ set.
+ */
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ int irq = vcpu->arch.pmu.irq_num;
+ if (!kvm_arm_pmu_irq_initialized(vcpu))
+ return -EINVAL;
+
+ /*
+ * If we are using an in-kernel vgic, at this point we know
+ * the vgic will be initialized, so we can check the PMU irq
+ * number against the dimensions of the vgic and make sure
+ * it's valid.
+ */
+ if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
+ return -EINVAL;
+ } else if (kvm_arm_pmu_irq_initialized(vcpu)) {
+ return -EINVAL;
+ }
+
+ kvm_pmu_vcpu_reset(vcpu);
+ vcpu->arch.pmu.ready = true;
+
+ return 0;
+}
+
+static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_arm_support_pmu_v3())
+ return -ENODEV;
+
+ if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ return -ENXIO;
+
+ if (vcpu->arch.pmu.created)
+ return -EBUSY;
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ int ret;
+
+ /*
+ * If using the PMU with an in-kernel virtual GIC
+ * implementation, we require the GIC to be already
+ * initialized when initializing the PMU.
+ */
+ if (!vgic_initialized(vcpu->kvm))
+ return -ENODEV;
+
+ if (!kvm_arm_pmu_irq_initialized(vcpu))
+ return -ENXIO;
+
+ ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
+ &vcpu->arch.pmu);
+ if (ret)
+ return ret;
+ }
+
+ vcpu->arch.pmu.created = true;
+ return 0;
+}
+
+/*
+ * For one VM the interrupt type must be the same for each vcpu.
+ * As a PPI, the interrupt number is the same for all vcpus,
+ * while as an SPI it must be a separate number per vcpu.
+ */
+static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_arm_pmu_irq_initialized(vcpu))
+ continue;
+
+ if (irq_is_ppi(irq)) {
+ if (vcpu->arch.pmu.irq_num != irq)
+ return false;
+ } else {
+ if (vcpu->arch.pmu.irq_num == irq)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_ARM_VCPU_PMU_V3_IRQ: {
+ int __user *uaddr = (int __user *)(long)attr->addr;
+ int irq;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return -EINVAL;
+
+ if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ return -ENODEV;
+
+ if (get_user(irq, uaddr))
+ return -EFAULT;
+
+ /* The PMU overflow interrupt can be a PPI or a valid SPI. */
+ if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
+ return -EINVAL;
+
+ if (!pmu_irq_is_valid(vcpu->kvm, irq))
+ return -EINVAL;
+
+ if (kvm_arm_pmu_irq_initialized(vcpu))
+ return -EBUSY;
+
+ kvm_debug("Set kvm ARM PMU irq: %d\n", irq);
+ vcpu->arch.pmu.irq_num = irq;
+ return 0;
+ }
+ case KVM_ARM_VCPU_PMU_V3_INIT:
+ return kvm_arm_pmu_v3_init(vcpu);
+ }
+
+ return -ENXIO;
+}
+
+int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_ARM_VCPU_PMU_V3_IRQ: {
+ int __user *uaddr = (int __user *)(long)attr->addr;
+ int irq;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return -EINVAL;
+
+ if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ return -ENODEV;
+
+ if (!kvm_arm_pmu_irq_initialized(vcpu))
+ return -ENXIO;
+
+ irq = vcpu->arch.pmu.irq_num;
+ return put_user(irq, uaddr);
+ }
+ }
+
+ return -ENXIO;
+}
+
+int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_ARM_VCPU_PMU_V3_IRQ:
+ case KVM_ARM_VCPU_PMU_V3_INIT:
+ if (kvm_arm_support_pmu_v3() &&
+ test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+ return 0;
+ }
+
+ return -ENXIO;
+}
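
For context, the set_attr path above is what a userspace VMM exercises when wiring up the guest PMU. A minimal sketch, assuming an already-created vcpu fd with the KVM_ARM_VCPU_PMU_V3 feature requested, and using PPI 23 purely as an arbitrary example IRQ number:

/* Hedged userspace sketch (not kernel code): configure, then init the vPMU. */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int vpmu_setup(int vcpu_fd)
{
	int irq = 23;	/* example PPI; must satisfy pmu_irq_is_valid() */
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_IRQ,
		.addr  = (uint64_t)(unsigned long)&irq,
	};

	/* Handled by kvm_arm_pmu_v3_set_attr(): record the overflow IRQ. */
	if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
		return -1;

	/* Then finalize; this ends up in kvm_arm_pmu_v3_init(). */
	attr.attr = KVM_ARM_VCPU_PMU_V3_INIT;
	attr.addr = 0;
	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}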
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
new file mode 100644
index 000000000000..83415e96b589
--- /dev/null
+++ b/arch/arm64/kvm/psci.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/preempt.h>
+#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <linux/wait.h>
+
+#include <asm/cputype.h>
+#include <asm/kvm_emulate.h>
+
+#include <kvm/arm_psci.h>
+#include <kvm/arm_hypercalls.h>
+
+/*
+ * This is an implementation of the Power State Coordination Interface
+ * as described in ARM document number ARM DEN 0022A.
+ */
+
+#define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
+
+static unsigned long psci_affinity_mask(unsigned long affinity_level)
+{
+ if (affinity_level <= 3)
+ return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);
+
+ return 0;
+}
+
+static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
+{
+ /*
+ * NOTE: For simplicity, we emulate VCPU suspend the same way as
+ * WFI (Wait-for-interrupt).
+ *
+ * This means that for KVM the wakeup events are interrupts, which
+ * is consistent with the intended use of StateID as described in
+ * section 5.4.1 of the PSCI v0.2 specification (ARM DEN 0022A).
+ *
+ * Further, we also treat a power-down request the same as a
+ * stand-by request, as per section 5.4.2 clause 3 of the PSCI
+ * v0.2 specification (ARM DEN 0022A). This means all suspend
+ * states for KVM will preserve the register state.
+ */
+ kvm_vcpu_block(vcpu);
+ kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+
+ return PSCI_RET_SUCCESS;
+}
+
+static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.power_off = true;
+ kvm_make_request(KVM_REQ_SLEEP, vcpu);
+ kvm_vcpu_kick(vcpu);
+}
+
+static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
+{
+ struct vcpu_reset_state *reset_state;
+ struct kvm *kvm = source_vcpu->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ unsigned long cpu_id;
+
+ cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
+ if (vcpu_mode_is_32bit(source_vcpu))
+ cpu_id &= ~((u32) 0);
+
+ vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);
+
+ /*
+ * Make sure the caller requested a valid CPU and that the CPU is
+ * turned off.
+ */
+ if (!vcpu)
+ return PSCI_RET_INVALID_PARAMS;
+ if (!vcpu->arch.power_off) {
+ if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1)
+ return PSCI_RET_ALREADY_ON;
+ else
+ return PSCI_RET_INVALID_PARAMS;
+ }
+
+ reset_state = &vcpu->arch.reset_state;
+
+ reset_state->pc = smccc_get_arg2(source_vcpu);
+
+ /* Propagate caller endianness */
+ reset_state->be = kvm_vcpu_is_be(source_vcpu);
+
+ /*
+ * NOTE: We always update r0 (or x0) because for PSCI v0.1
+ * the general purpose registers are undefined upon CPU_ON.
+ */
+ reset_state->r0 = smccc_get_arg3(source_vcpu);
+
+ WRITE_ONCE(reset_state->reset, true);
+ kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
+
+ /*
+ * Make sure the reset request is observed if the change to
+ * power_off is observed.
+ */
+ smp_wmb();
+
+ vcpu->arch.power_off = false;
+ kvm_vcpu_wake_up(vcpu);
+
+ return PSCI_RET_SUCCESS;
+}
+
+static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
+{
+ int i, matching_cpus = 0;
+ unsigned long mpidr;
+ unsigned long target_affinity;
+ unsigned long target_affinity_mask;
+ unsigned long lowest_affinity_level;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu *tmp;
+
+ target_affinity = smccc_get_arg1(vcpu);
+ lowest_affinity_level = smccc_get_arg2(vcpu);
+
+ /* Determine target affinity mask */
+ target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
+ if (!target_affinity_mask)
+ return PSCI_RET_INVALID_PARAMS;
+
+ /* Ignore other bits of target affinity */
+ target_affinity &= target_affinity_mask;
+
+ /*
+ * If one or more VCPUs matching the target affinity are
+ * running, return ON, otherwise OFF.
+ */
+ kvm_for_each_vcpu(i, tmp, kvm) {
+ mpidr = kvm_vcpu_get_mpidr_aff(tmp);
+ if ((mpidr & target_affinity_mask) == target_affinity) {
+ matching_cpus++;
+ if (!tmp->arch.power_off)
+ return PSCI_0_2_AFFINITY_LEVEL_ON;
+ }
+ }
+
+ if (!matching_cpus)
+ return PSCI_RET_INVALID_PARAMS;
+
+ return PSCI_0_2_AFFINITY_LEVEL_OFF;
+}
+
+static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
+{
+ int i;
+ struct kvm_vcpu *tmp;
+
+ /*
+ * The KVM ABI specifies that a system event exit may call KVM_RUN
+ * again and may perform shutdown/reboot at a later time than when the
+ * actual request is made. Since we are implementing PSCI and a
+ * caller of PSCI reboot and shutdown expects that the system shuts
+ * down or reboots immediately, let's make sure that VCPUs are not run
+ * after this call is handled and before the VCPUs have been
+ * re-initialized.
+ */
+ kvm_for_each_vcpu(i, tmp, vcpu->kvm)
+ tmp->arch.power_off = true;
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
+
+ memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
+ vcpu->run->system_event.type = type;
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+}
+
+static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
+{
+ kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN);
+}
+
+static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
+}
+
+static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ /*
+ * Zero the input registers' upper 32 bits. They will be fully
+ * zeroed on exit, so we're fine changing them in place.
+ */
+ for (i = 1; i < 4; i++)
+ vcpu_set_reg(vcpu, i, lower_32_bits(vcpu_get_reg(vcpu, i)));
+}
+
+static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn)
+{
+ switch(fn) {
+ case PSCI_0_2_FN64_CPU_SUSPEND:
+ case PSCI_0_2_FN64_CPU_ON:
+ case PSCI_0_2_FN64_AFFINITY_INFO:
+ /* Disallow these functions for 32bit guests */
+ if (vcpu_mode_is_32bit(vcpu))
+ return PSCI_RET_NOT_SUPPORTED;
+ break;
+ }
+
+ return 0;
+}
+
+static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u32 psci_fn = smccc_get_function(vcpu);
+ unsigned long val;
+ int ret = 1;
+
+ val = kvm_psci_check_allowed_function(vcpu, psci_fn);
+ if (val)
+ goto out;
+
+ switch (psci_fn) {
+ case PSCI_0_2_FN_PSCI_VERSION:
+ /*
+ * Bits[31:16] = Major Version = 0
+ * Bits[15:0] = Minor Version = 2
+ */
+ val = KVM_ARM_PSCI_0_2;
+ break;
+ case PSCI_0_2_FN_CPU_SUSPEND:
+ case PSCI_0_2_FN64_CPU_SUSPEND:
+ val = kvm_psci_vcpu_suspend(vcpu);
+ break;
+ case PSCI_0_2_FN_CPU_OFF:
+ kvm_psci_vcpu_off(vcpu);
+ val = PSCI_RET_SUCCESS;
+ break;
+ case PSCI_0_2_FN_CPU_ON:
+ kvm_psci_narrow_to_32bit(vcpu);
+ fallthrough;
+ case PSCI_0_2_FN64_CPU_ON:
+ mutex_lock(&kvm->lock);
+ val = kvm_psci_vcpu_on(vcpu);
+ mutex_unlock(&kvm->lock);
+ break;
+ case PSCI_0_2_FN_AFFINITY_INFO:
+ kvm_psci_narrow_to_32bit(vcpu);
+ fallthrough;
+ case PSCI_0_2_FN64_AFFINITY_INFO:
+ val = kvm_psci_vcpu_affinity_info(vcpu);
+ break;
+ case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+ /*
+ * The Trusted OS is MP, hence it does not require migration,
+ * or
+ * the Trusted OS is not present.
+ */
+ val = PSCI_0_2_TOS_MP;
+ break;
+ case PSCI_0_2_FN_SYSTEM_OFF:
+ kvm_psci_system_off(vcpu);
+ /*
+ * We shouldn't go back to the guest VCPU after
+ * receiving a SYSTEM_OFF request.
+ *
+ * If user space accidentally or deliberately resumes the
+ * guest VCPU after a SYSTEM_OFF request, the guest VCPU
+ * should see an internal failure as the PSCI return
+ * value. To achieve this, we preload r0 (or x0) with the
+ * PSCI return value INTERNAL_FAILURE.
+ */
+ val = PSCI_RET_INTERNAL_FAILURE;
+ ret = 0;
+ break;
+ case PSCI_0_2_FN_SYSTEM_RESET:
+ kvm_psci_system_reset(vcpu);
+ /*
+ * Same reason as SYSTEM_OFF for preloading r0 (or x0)
+ * with PSCI return value INTERNAL_FAILURE.
+ */
+ val = PSCI_RET_INTERNAL_FAILURE;
+ ret = 0;
+ break;
+ default:
+ val = PSCI_RET_NOT_SUPPORTED;
+ break;
+ }
+
+out:
+ smccc_set_retval(vcpu, val, 0, 0, 0);
+ return ret;
+}
+
+static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
+{
+ u32 psci_fn = smccc_get_function(vcpu);
+ u32 feature;
+ unsigned long val;
+ int ret = 1;
+
+ switch(psci_fn) {
+ case PSCI_0_2_FN_PSCI_VERSION:
+ val = KVM_ARM_PSCI_1_0;
+ break;
+ case PSCI_1_0_FN_PSCI_FEATURES:
+ feature = smccc_get_arg1(vcpu);
+ val = kvm_psci_check_allowed_function(vcpu, feature);
+ if (val)
+ break;
+
+ switch(feature) {
+ case PSCI_0_2_FN_PSCI_VERSION:
+ case PSCI_0_2_FN_CPU_SUSPEND:
+ case PSCI_0_2_FN64_CPU_SUSPEND:
+ case PSCI_0_2_FN_CPU_OFF:
+ case PSCI_0_2_FN_CPU_ON:
+ case PSCI_0_2_FN64_CPU_ON:
+ case PSCI_0_2_FN_AFFINITY_INFO:
+ case PSCI_0_2_FN64_AFFINITY_INFO:
+ case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
+ case PSCI_0_2_FN_SYSTEM_OFF:
+ case PSCI_0_2_FN_SYSTEM_RESET:
+ case PSCI_1_0_FN_PSCI_FEATURES:
+ case ARM_SMCCC_VERSION_FUNC_ID:
+ val = 0;
+ break;
+ default:
+ val = PSCI_RET_NOT_SUPPORTED;
+ break;
+ }
+ break;
+ default:
+ return kvm_psci_0_2_call(vcpu);
+ }
+
+ smccc_set_retval(vcpu, val, 0, 0, 0);
+ return ret;
+}
+
+static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u32 psci_fn = smccc_get_function(vcpu);
+ unsigned long val;
+
+ switch (psci_fn) {
+ case KVM_PSCI_FN_CPU_OFF:
+ kvm_psci_vcpu_off(vcpu);
+ val = PSCI_RET_SUCCESS;
+ break;
+ case KVM_PSCI_FN_CPU_ON:
+ mutex_lock(&kvm->lock);
+ val = kvm_psci_vcpu_on(vcpu);
+ mutex_unlock(&kvm->lock);
+ break;
+ default:
+ val = PSCI_RET_NOT_SUPPORTED;
+ break;
+ }
+
+ smccc_set_retval(vcpu, val, 0, 0, 0);
+ return 1;
+}
+
+/**
+ * kvm_psci_call - handle PSCI call if r0 value is in range
+ * @vcpu: Pointer to the VCPU struct
+ *
+ * Handle PSCI calls from guests through traps from HVC instructions.
+ * The calling convention is similar to SMC calls to the secure world
+ * where the function number is placed in r0.
+ *
+ * This function returns: > 0 (success), 0 (success but exit to user
+ * space), and < 0 (errors)
+ *
+ * Errors:
+ * -EINVAL: Unrecognized PSCI function
+ */
+int kvm_psci_call(struct kvm_vcpu *vcpu)
+{
+ switch (kvm_psci_version(vcpu, vcpu->kvm)) {
+ case KVM_ARM_PSCI_1_0:
+ return kvm_psci_1_0_call(vcpu);
+ case KVM_ARM_PSCI_0_2:
+ return kvm_psci_0_2_call(vcpu);
+ case KVM_ARM_PSCI_0_1:
+ return kvm_psci_0_1_call(vcpu);
+ default:
+ return -EINVAL;
+ };
+}
+
+int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
+{
+ return 3; /* PSCI version and two workaround registers */
+}
+
+int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
+{
+ if (put_user(KVM_REG_ARM_PSCI_VERSION, uindices++))
+ return -EFAULT;
+
+ if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1, uindices++))
+ return -EFAULT;
+
+ if (put_user(KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2, uindices++))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define KVM_REG_FEATURE_LEVEL_WIDTH 4
+#define KVM_REG_FEATURE_LEVEL_MASK (BIT(KVM_REG_FEATURE_LEVEL_WIDTH) - 1)
+
+/*
+ * Convert the workaround level into an easy-to-compare number, where higher
+ * values mean better protection.
+ */
+static int get_kernel_wa_level(u64 regid)
+{
+ switch (regid) {
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+ switch (kvm_arm_harden_branch_predictor()) {
+ case KVM_BP_HARDEN_UNKNOWN:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
+ case KVM_BP_HARDEN_WA_NEEDED:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
+ case KVM_BP_HARDEN_NOT_REQUIRED:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
+ }
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+ switch (kvm_arm_have_ssbd()) {
+ case KVM_SSBD_FORCE_DISABLE:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+ case KVM_SSBD_KERNEL:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
+ case KVM_SSBD_FORCE_ENABLE:
+ case KVM_SSBD_MITIGATED:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
+ case KVM_SSBD_UNKNOWN:
+ default:
+ return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
+ }
+ }
+
+ return -EINVAL;
+}
+
+int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+ void __user *uaddr = (void __user *)(long)reg->addr;
+ u64 val;
+
+ switch (reg->id) {
+ case KVM_REG_ARM_PSCI_VERSION:
+ val = kvm_psci_version(vcpu, vcpu->kvm);
+ break;
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+ val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+ break;
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+ val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
+
+ if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
+ kvm_arm_get_vcpu_workaround_2_flag(vcpu))
+ val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ if (copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
+{
+ void __user *uaddr = (void __user *)(long)reg->addr;
+ u64 val;
+ int wa_level;
+
+ if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
+ return -EFAULT;
+
+ switch (reg->id) {
+ case KVM_REG_ARM_PSCI_VERSION:
+ {
+ bool wants_02;
+
+ wants_02 = test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features);
+
+ switch (val) {
+ case KVM_ARM_PSCI_0_1:
+ if (wants_02)
+ return -EINVAL;
+ vcpu->kvm->arch.psci_version = val;
+ return 0;
+ case KVM_ARM_PSCI_0_2:
+ case KVM_ARM_PSCI_1_0:
+ if (!wants_02)
+ return -EINVAL;
+ vcpu->kvm->arch.psci_version = val;
+ return 0;
+ }
+ break;
+ }
+
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+ if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
+ return -EINVAL;
+
+ if (get_kernel_wa_level(reg->id) < val)
+ return -EINVAL;
+
+ return 0;
+
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+ if (val & ~(KVM_REG_FEATURE_LEVEL_MASK |
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
+ return -EINVAL;
+
+ wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
+
+ if (get_kernel_wa_level(reg->id) < wa_level)
+ return -EINVAL;
+
+ /* The enabled bit must not be set unless the level is AVAIL. */
+ if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
+ wa_level != val)
+ return -EINVAL;
+
+ /* Are we finished or do we need to check the enable bit? */
+ if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
+ return 0;
+
+ /*
+ * If this kernel supports switching the workaround on or off,
+ * make sure it matches the requested setting.
+ */
+ switch (wa_level) {
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
+ kvm_arm_set_vcpu_workaround_2_flag(vcpu,
+ val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
+ break;
+ case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
+ kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
+ break;
+ }
+
+ return 0;
+ default:
+ return -ENOENT;
+ }
+
+ return -EINVAL;
+}
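
The firmware-register accessors above (kvm_arm_get_fw_reg()/kvm_arm_set_fw_reg()) are reached through the ONE_REG interface. A minimal userspace sketch for querying the PSCI version, assuming a valid vcpu fd and the uapi headers:

/* Hedged userspace sketch: read KVM_REG_ARM_PSCI_VERSION via ONE_REG. */
#include <linux/kvm.h>
#include <linux/psci.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void print_guest_psci_version(int vcpu_fd)
{
	uint64_t val = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM_PSCI_VERSION,
		.addr = (uint64_t)(unsigned long)&val,
	};

	/* Serviced by kvm_arm_get_fw_reg() above. */
	if (!ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg))
		printf("guest PSCI %lu.%lu\n",
		       (unsigned long)PSCI_VERSION_MAJOR(val),
		       (unsigned long)PSCI_VERSION_MINOR(val));
}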
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
new file mode 100644
index 000000000000..1e0f4c284888
--- /dev/null
+++ b/arch/arm64/kvm/pvtime.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2019 Arm Ltd.
+
+#include <linux/arm-smccc.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_mmu.h>
+#include <asm/pvclock-abi.h>
+
+#include <kvm/arm_hypercalls.h>
+
+void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 steal;
+ __le64 steal_le;
+ u64 offset;
+ int idx;
+ u64 base = vcpu->arch.steal.base;
+
+ if (base == GPA_INVALID)
+ return;
+
+ /* Let's do the local bookkeeping */
+ steal = vcpu->arch.steal.steal;
+ steal += current->sched_info.run_delay - vcpu->arch.steal.last_steal;
+ vcpu->arch.steal.last_steal = current->sched_info.run_delay;
+ vcpu->arch.steal.steal = steal;
+
+ steal_le = cpu_to_le64(steal);
+ idx = srcu_read_lock(&kvm->srcu);
+ offset = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
+ kvm_put_guest(kvm, base + offset, steal_le, u64);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
+long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
+{
+ u32 feature = smccc_get_arg1(vcpu);
+ long val = SMCCC_RET_NOT_SUPPORTED;
+
+ switch (feature) {
+ case ARM_SMCCC_HV_PV_TIME_FEATURES:
+ case ARM_SMCCC_HV_PV_TIME_ST:
+ val = SMCCC_RET_SUCCESS;
+ break;
+ }
+
+ return val;
+}
+
+gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
+{
+ struct pvclock_vcpu_stolen_time init_values = {};
+ struct kvm *kvm = vcpu->kvm;
+ u64 base = vcpu->arch.steal.base;
+ int idx;
+
+ if (base == GPA_INVALID)
+ return base;
+
+ /*
+ * Start counting stolen time from the point at which the guest
+ * enables the feature.
+ */
+ vcpu->arch.steal.steal = 0;
+ vcpu->arch.steal.last_steal = current->sched_info.run_delay;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_write_guest(kvm, base, &init_values, sizeof(init_values));
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return base;
+}
+
+int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *user = (u64 __user *)attr->addr;
+ struct kvm *kvm = vcpu->kvm;
+ u64 ipa;
+ int ret = 0;
+ int idx;
+
+ if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
+ return -ENXIO;
+
+ if (get_user(ipa, user))
+ return -EFAULT;
+ if (!IS_ALIGNED(ipa, 64))
+ return -EINVAL;
+ if (vcpu->arch.steal.base != GPA_INVALID)
+ return -EEXIST;
+
+ /* Check the address is in a valid memslot */
+ idx = srcu_read_lock(&kvm->srcu);
+ if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT)))
+ ret = -EINVAL;
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (!ret)
+ vcpu->arch.steal.base = ipa;
+
+ return ret;
+}
+
+int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *user = (u64 __user *)attr->addr;
+ u64 ipa;
+
+ if (attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
+ return -ENXIO;
+
+ ipa = vcpu->arch.steal.base;
+
+ if (put_user(ipa, user))
+ return -EFAULT;
+ return 0;
+}
+
+int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->attr) {
+ case KVM_ARM_VCPU_PVTIME_IPA:
+ return 0;
+ }
+ return -ENXIO;
+}
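
kvm_arm_pvtime_set_attr() above is driven from userspace through the vcpu device-attribute API. A minimal sketch, assuming the chosen IPA is 64-byte aligned and already covered by a memslot:

/* Hedged userspace sketch: register the stolen-time region for a vcpu. */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int pvtime_enable(int vcpu_fd, uint64_t ipa)
{
	/* ipa must be 64-byte aligned and backed by guest memory,
	 * exactly as checked in kvm_arm_pvtime_set_attr(). */
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_PVTIME_CTRL,
		.attr  = KVM_ARM_VCPU_PVTIME_IPA,
		.addr  = (uint64_t)(unsigned long)&ipa,
	};

	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}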
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 30b7ea680f66..d3b209023727 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -36,23 +36,11 @@ static u32 kvm_ipa_limit;
/*
* ARMv8 Reset Values
*/
-static const struct kvm_regs default_regs_reset = {
- .regs.pstate = (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT |
- PSR_F_BIT | PSR_D_BIT),
-};
+#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \
+ PSR_F_BIT | PSR_D_BIT)
-static const struct kvm_regs default_regs_reset32 = {
- .regs.pstate = (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT |
- PSR_AA32_I_BIT | PSR_AA32_F_BIT),
-};
-
-static bool cpu_has_32bit_el1(void)
-{
- u64 pfr0;
-
- pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
- return !!(pfr0 & 0x20);
-}
+#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \
+ PSR_AA32_I_BIT | PSR_AA32_F_BIT)
/**
* kvm_arch_vm_ioctl_check_extension
@@ -66,7 +54,7 @@ int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
switch (ext) {
case KVM_CAP_ARM_EL1_32BIT:
- r = cpu_has_32bit_el1();
+ r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1);
break;
case KVM_CAP_GUEST_DEBUG_HW_BPS:
r = get_num_brps();
@@ -163,7 +151,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu)
vl = vcpu->arch.sve_max_vl;
/*
- * Resposibility for these properties is shared between
+ * Responsibility for these properties is shared between
* kvm_arm_init_arch_resources(), kvm_vcpu_enable_sve() and
* set_sve_vls(). Double-check here just to be sure:
*/
@@ -249,7 +237,7 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
* ioctl or as part of handling a request issued by another VCPU in the PSCI
* handling code. In the first case, the VCPU will not be loaded, and in the
* second case the VCPU will be loaded. Because this function operates purely
- * on the memory-backed valus of system registers, we want to do a full put if
+ * on the memory-backed values of system registers, we want to do a full put if
* we were loaded (handling a request) and load the values back at the end of
* the function. Otherwise we leave the state alone. In both cases, we
* disable preemption around the vcpu reset as we would otherwise race with
@@ -257,9 +245,9 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu)
*/
int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
- const struct kvm_regs *cpu_reset;
int ret = -EINVAL;
bool loaded;
+ u32 pstate;
/* Reset PMU outside of the non-preemptible section */
kvm_pmu_vcpu_reset(vcpu);
@@ -288,18 +276,19 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
switch (vcpu->arch.target) {
default:
if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features)) {
- if (!cpu_has_32bit_el1())
+ if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1))
goto out;
- cpu_reset = &default_regs_reset32;
+ pstate = VCPU_RESET_PSTATE_SVC;
} else {
- cpu_reset = &default_regs_reset;
+ pstate = VCPU_RESET_PSTATE_EL1;
}
break;
}
/* Reset core registers */
- memcpy(vcpu_gp_regs(vcpu), cpu_reset, sizeof(*cpu_reset));
+ memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
+ vcpu_gp_regs(vcpu)->regs.pstate = pstate;
/* Reset system registers */
kvm_reset_sys_regs(vcpu);
@@ -340,11 +329,50 @@ out:
return ret;
}
-void kvm_set_ipa_limit(void)
+u32 get_kvm_ipa_limit(void)
+{
+ return kvm_ipa_limit;
+}
+
+int kvm_set_ipa_limit(void)
{
- unsigned int ipa_max, pa_max, va_max, parange;
+ unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
+ u64 mmfr0;
+
+ mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ parange = cpuid_feature_extract_unsigned_field(mmfr0,
+ ID_AA64MMFR0_PARANGE_SHIFT);
+
+ /*
+ * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at
+ * Stage-2. If not, things will stop very quickly.
+ */
+ switch (PAGE_SIZE) {
+ default:
+ case SZ_4K:
+ tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT;
+ break;
+ case SZ_16K:
+ tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT;
+ break;
+ case SZ_64K:
+ tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT;
+ break;
+ }
+
+ switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) {
+ default:
+ case 1:
+ kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n");
+ return -EINVAL;
+ case 0:
+ kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n");
+ break;
+ case 2:
+ kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n");
+ break;
+ }
- parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7;
pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
/* Clamp the IPA limit to the PA size supported by the kernel */
@@ -357,7 +385,7 @@ void kvm_set_ipa_limit(void)
*
* So clamp the ipa limit further down to limit the number of levels.
* Since we can concatenate upto 16 tables at entry level, we could
- * go upto 4bits above the maximum VA addressible with the current
+ * go upto 4bits above the maximum VA addressable with the current
* number of levels.
*/
va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
@@ -378,6 +406,8 @@ void kvm_set_ipa_limit(void)
"KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
kvm_ipa_limit = ipa_max;
kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+
+ return 0;
}
/*
@@ -390,7 +420,7 @@ void kvm_set_ipa_limit(void)
*/
int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
{
- u64 vtcr = VTCR_EL2_FLAGS;
+ u64 vtcr = VTCR_EL2_FLAGS, mmfr0;
u32 parange, phys_shift;
u8 lvls;
@@ -406,7 +436,9 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
phys_shift = KVM_PHYS_SHIFT;
}
- parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7;
+ mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
+ parange = cpuid_feature_extract_unsigned_field(mmfr0,
+ ID_AA64MMFR0_PARANGE_SHIFT);
if (parange > ID_AA64MMFR0_PARANGE_MAX)
parange = ID_AA64MMFR0_PARANGE_MAX;
vtcr |= parange << VTCR_EL2_PS_SHIFT;
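
To make the clamping in kvm_set_ipa_limit() concrete, here is a stand-alone arithmetic sketch. It assumes 4K pages with 48-bit VAs (PGDIR_SHIFT = 39) and a 48-bit PARange; the real code derives these values from the kernel configuration and ID_AA64MMFR0_EL1.

/* Stand-alone illustration of the IPA clamping described above. */
#include <stdio.h>

int main(void)
{
	unsigned int pa_max = 48;		/* from ID_AA64MMFR0_EL1.PARange */
	unsigned int pgdir_shift = 39;		/* 4K pages, 4 levels */
	unsigned int page_shift = 12;
	unsigned int va_max = pgdir_shift + page_shift - 3;	/* 48 bits */

	/* Concatenating up to 16 tables at the entry level buys 4 more bits. */
	va_max += 4;						/* 52 bits */

	unsigned int ipa_max = pa_max < va_max ? pa_max : va_max;
	printf("IPA Size Limit: %u bits\n", ipa_max);
	return 0;
}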
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 51db934702b6..80985439bfb2 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -34,7 +34,7 @@
#include "trace.h"
/*
- * All of this file is extremly similar to the ARM coproc.c, but the
+ * All of this file is extremely similar to the ARM coproc.c, but the
* types are different. My gut feeling is that it should be pretty
* easy to merge, but that would be an ABI breakage -- again. VFP
* would also need to be abstracted.
@@ -64,11 +64,8 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu,
return false;
}
-u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
+static bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
{
- if (!vcpu->arch.sysregs_loaded_on_cpu)
- goto immediate_read;
-
/*
* System registers listed in the switch are not saved on every
* exit from the guest but are only saved on vcpu_put.
@@ -79,75 +76,92 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
* thread when emulating cross-VCPU communication.
*/
switch (reg) {
- case CSSELR_EL1: return read_sysreg_s(SYS_CSSELR_EL1);
- case SCTLR_EL1: return read_sysreg_s(SYS_SCTLR_EL12);
- case ACTLR_EL1: return read_sysreg_s(SYS_ACTLR_EL1);
- case CPACR_EL1: return read_sysreg_s(SYS_CPACR_EL12);
- case TTBR0_EL1: return read_sysreg_s(SYS_TTBR0_EL12);
- case TTBR1_EL1: return read_sysreg_s(SYS_TTBR1_EL12);
- case TCR_EL1: return read_sysreg_s(SYS_TCR_EL12);
- case ESR_EL1: return read_sysreg_s(SYS_ESR_EL12);
- case AFSR0_EL1: return read_sysreg_s(SYS_AFSR0_EL12);
- case AFSR1_EL1: return read_sysreg_s(SYS_AFSR1_EL12);
- case FAR_EL1: return read_sysreg_s(SYS_FAR_EL12);
- case MAIR_EL1: return read_sysreg_s(SYS_MAIR_EL12);
- case VBAR_EL1: return read_sysreg_s(SYS_VBAR_EL12);
- case CONTEXTIDR_EL1: return read_sysreg_s(SYS_CONTEXTIDR_EL12);
- case TPIDR_EL0: return read_sysreg_s(SYS_TPIDR_EL0);
- case TPIDRRO_EL0: return read_sysreg_s(SYS_TPIDRRO_EL0);
- case TPIDR_EL1: return read_sysreg_s(SYS_TPIDR_EL1);
- case AMAIR_EL1: return read_sysreg_s(SYS_AMAIR_EL12);
- case CNTKCTL_EL1: return read_sysreg_s(SYS_CNTKCTL_EL12);
- case PAR_EL1: return read_sysreg_s(SYS_PAR_EL1);
- case DACR32_EL2: return read_sysreg_s(SYS_DACR32_EL2);
- case IFSR32_EL2: return read_sysreg_s(SYS_IFSR32_EL2);
- case DBGVCR32_EL2: return read_sysreg_s(SYS_DBGVCR32_EL2);
+ case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break;
+ case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break;
+ case ACTLR_EL1: *val = read_sysreg_s(SYS_ACTLR_EL1); break;
+ case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break;
+ case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break;
+ case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break;
+ case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break;
+ case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break;
+ case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break;
+ case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break;
+ case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break;
+ case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break;
+ case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break;
+ case PAR_EL1: *val = read_sysreg_s(SYS_PAR_EL1); break;
+ case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break;
+ case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break;
+ default: return false;
}
-immediate_read:
- return __vcpu_sys_reg(vcpu, reg);
+ return true;
}
-void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+static bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
{
- if (!vcpu->arch.sysregs_loaded_on_cpu)
- goto immediate_write;
-
/*
* System registers listed in the switch are not restored on every
* entry to the guest but are only restored on vcpu_load.
*
* Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
- * should never be listed below, because the the MPIDR should only be
- * set once, before running the VCPU, and never changed later.
+ * should never be listed below, because the MPIDR should only be set
+ * once, before running the VCPU, and never changed later.
*/
switch (reg) {
- case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); return;
- case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); return;
- case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); return;
- case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); return;
- case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); return;
- case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); return;
- case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); return;
- case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); return;
- case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); return;
- case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); return;
- case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); return;
- case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); return;
- case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); return;
- case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12); return;
- case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); return;
- case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); return;
- case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); return;
- case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); return;
- case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); return;
- case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); return;
- case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); return;
- case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); return;
- case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); return;
+ case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break;
+ case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
+ case ACTLR_EL1: write_sysreg_s(val, SYS_ACTLR_EL1); break;
+ case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
+ case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
+ case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break;
+ case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break;
+ case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break;
+ case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break;
+ case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break;
+ case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break;
+ case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break;
+ case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break;
+ case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
+ case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break;
+ case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break;
+ case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break;
+ case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break;
+ case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break;
+ case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break;
+ case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break;
+ case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break;
+ case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break;
+ default: return false;
}
-immediate_write:
+ return true;
+}
+
+u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
+{
+ u64 val = 0x8badf00d8badf00d;
+
+ if (vcpu->arch.sysregs_loaded_on_cpu &&
+ __vcpu_read_sys_reg_from_cpu(reg, &val))
+ return val;
+
+ return __vcpu_sys_reg(vcpu, reg);
+}
+
+void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
+{
+ if (vcpu->arch.sysregs_loaded_on_cpu &&
+ __vcpu_write_sys_reg_to_cpu(val, reg))
+ return;
+
__vcpu_sys_reg(vcpu, reg) = val;
}
@@ -1456,9 +1470,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_SANITISED(MVFR1_EL1),
ID_SANITISED(MVFR2_EL1),
ID_UNALLOCATED(3,3),
- ID_UNALLOCATED(3,4),
- ID_UNALLOCATED(3,5),
- ID_UNALLOCATED(3,6),
+ ID_SANITISED(ID_PFR2_EL1),
+ ID_HIDDEN(ID_DFR1_EL1),
+ ID_SANITISED(ID_MMFR5_EL1),
ID_UNALLOCATED(3,7),
/* AArch64 ID registers */
@@ -1532,7 +1546,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },
{ SYS_DESC(SYS_PMINTENSET_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 },
- { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, NULL, PMINTENSET_EL1 },
+ { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 },
{ SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
{ SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },
@@ -1571,8 +1585,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, PMCR_EL0 },
{ SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 },
- { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, NULL, PMCNTENSET_EL0 },
- { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, NULL, PMOVSSET_EL0 },
+ { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 },
+ { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, reset_unknown, PMOVSSET_EL0 },
{ SYS_DESC(SYS_PMSWINC_EL0), access_pmswinc, reset_unknown, PMSWINC_EL0 },
{ SYS_DESC(SYS_PMSELR_EL0), access_pmselr, reset_unknown, PMSELR_EL0 },
{ SYS_DESC(SYS_PMCEID0_EL0), access_pmceid },
@@ -2073,12 +2087,37 @@ static const struct sys_reg_desc cp15_64_regs[] = {
{ SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer },
};
+static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
+ bool is_32)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; i++) {
+ if (!is_32 && table[i].reg && !table[i].reset) {
+ kvm_err("sys_reg table %p entry %d has lacks reset\n",
+ table, i);
+ return 1;
+ }
+
+ if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
+ kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
/* Target specific emulation tables */
static struct kvm_sys_reg_target_table *target_tables[KVM_ARM_NUM_TARGETS];
void kvm_register_target_sys_reg_table(unsigned int target,
struct kvm_sys_reg_target_table *table)
{
+ if (check_sysreg_table(table->table64.table, table->table64.num, false) ||
+ check_sysreg_table(table->table32.table, table->table32.num, true))
+ return;
+
target_tables[target] = table;
}
@@ -2364,19 +2403,13 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu,
}
static void reset_sys_reg_descs(struct kvm_vcpu *vcpu,
- const struct sys_reg_desc *table, size_t num,
- unsigned long *bmap)
+ const struct sys_reg_desc *table, size_t num)
{
unsigned long i;
for (i = 0; i < num; i++)
- if (table[i].reset) {
- int reg = table[i].reg;
-
+ if (table[i].reset)
table[i].reset(vcpu, &table[i]);
- if (reg > 0 && reg < NR_SYS_REGS)
- set_bit(reg, bmap);
- }
}
/**
@@ -2832,32 +2865,18 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
return write_demux_regids(uindices);
}
-static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n)
-{
- unsigned int i;
-
- for (i = 1; i < n; i++) {
- if (cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
- kvm_err("sys_reg table %p out of order (%d)\n", table, i - 1);
- return 1;
- }
- }
-
- return 0;
-}
-
void kvm_sys_reg_table_init(void)
{
unsigned int i;
struct sys_reg_desc clidr;
/* Make sure tables are unique and in order. */
- BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs)));
- BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs)));
- BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs)));
- BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs)));
- BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs)));
- BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)));
+ BUG_ON(check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false));
+ BUG_ON(check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true));
+ BUG_ON(check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true));
+ BUG_ON(check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true));
+ BUG_ON(check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true));
+ BUG_ON(check_sysreg_table(invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs), false));
/* We abuse the reset function to overwrite the table itself. */
for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
@@ -2893,17 +2912,10 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
{
size_t num;
const struct sys_reg_desc *table;
- DECLARE_BITMAP(bmap, NR_SYS_REGS) = { 0, };
/* Generic chip reset first (so target could override). */
- reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs), bmap);
+ reset_sys_reg_descs(vcpu, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
table = get_target_table(vcpu->arch.target, true, &num);
- reset_sys_reg_descs(vcpu, table, num, bmap);
-
- for (num = 1; num < NR_SYS_REGS; num++) {
- if (WARN(!test_bit(num, bmap),
- "Didn't reset __vcpu_sys_reg(%zi)\n", num))
- break;
- }
+ reset_sys_reg_descs(vcpu, table, num);
}
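
The descriptor tables above also back the ONE_REG interface that userspace uses to inspect guest system registers. A minimal sketch, assuming a valid vcpu fd and using MPIDR_EL1 (op0=3, op1=0, CRn=0, CRm=0, op2=5) as the example register:

/* Hedged userspace sketch: read a guest system register via ONE_REG. */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static uint64_t read_guest_mpidr(int vcpu_fd)
{
	uint64_t val = 0;
	struct kvm_one_reg reg = {
		.id   = ARM64_SYS_REG(3, 0, 0, 0, 5),	/* MPIDR_EL1 */
		.addr = (uint64_t)(unsigned long)&val,
	};

	ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
	return val;
}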
diff --git a/arch/arm64/kvm/trace.h b/arch/arm64/kvm/trace.h
index eab91ad0effb..86f9ea47be29 100644
--- a/arch/arm64/kvm/trace.h
+++ b/arch/arm64/kvm/trace.h
@@ -1,216 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#ifndef _TRACE_ARM64_KVM_H
#define _TRACE_ARM64_KVM_H
-#include <linux/tracepoint.h>
-#include "sys_regs.h"
+#include "trace_arm.h"
+#include "trace_handle_exit.h"
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kvm
-
-TRACE_EVENT(kvm_wfx_arm64,
- TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
- TP_ARGS(vcpu_pc, is_wfe),
-
- TP_STRUCT__entry(
- __field(unsigned long, vcpu_pc)
- __field(bool, is_wfe)
- ),
-
- TP_fast_assign(
- __entry->vcpu_pc = vcpu_pc;
- __entry->is_wfe = is_wfe;
- ),
-
- TP_printk("guest executed wf%c at: 0x%08lx",
- __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
-);
-
-TRACE_EVENT(kvm_hvc_arm64,
- TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
- TP_ARGS(vcpu_pc, r0, imm),
-
- TP_STRUCT__entry(
- __field(unsigned long, vcpu_pc)
- __field(unsigned long, r0)
- __field(unsigned long, imm)
- ),
-
- TP_fast_assign(
- __entry->vcpu_pc = vcpu_pc;
- __entry->r0 = r0;
- __entry->imm = imm;
- ),
-
- TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
- __entry->vcpu_pc, __entry->r0, __entry->imm)
-);
-
-TRACE_EVENT(kvm_arm_setup_debug,
- TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
- TP_ARGS(vcpu, guest_debug),
-
- TP_STRUCT__entry(
- __field(struct kvm_vcpu *, vcpu)
- __field(__u32, guest_debug)
- ),
-
- TP_fast_assign(
- __entry->vcpu = vcpu;
- __entry->guest_debug = guest_debug;
- ),
-
- TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
-);
-
-TRACE_EVENT(kvm_arm_clear_debug,
- TP_PROTO(__u32 guest_debug),
- TP_ARGS(guest_debug),
-
- TP_STRUCT__entry(
- __field(__u32, guest_debug)
- ),
-
- TP_fast_assign(
- __entry->guest_debug = guest_debug;
- ),
-
- TP_printk("flags: 0x%08x", __entry->guest_debug)
-);
-
-TRACE_EVENT(kvm_arm_set_dreg32,
- TP_PROTO(const char *name, __u32 value),
- TP_ARGS(name, value),
-
- TP_STRUCT__entry(
- __field(const char *, name)
- __field(__u32, value)
- ),
-
- TP_fast_assign(
- __entry->name = name;
- __entry->value = value;
- ),
-
- TP_printk("%s: 0x%08x", __entry->name, __entry->value)
-);
-
-TRACE_DEFINE_SIZEOF(__u64);
-
-TRACE_EVENT(kvm_arm_set_regset,
- TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
- TP_ARGS(type, len, control, value),
- TP_STRUCT__entry(
- __field(const char *, name)
- __field(int, len)
- __array(u64, ctrls, 16)
- __array(u64, values, 16)
- ),
- TP_fast_assign(
- __entry->name = type;
- __entry->len = len;
- memcpy(__entry->ctrls, control, len << 3);
- memcpy(__entry->values, value, len << 3);
- ),
- TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
- __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
- __print_array(__entry->values, __entry->len, sizeof(__u64)))
-);
-
-TRACE_EVENT(trap_reg,
- TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
- TP_ARGS(fn, reg, is_write, write_value),
-
- TP_STRUCT__entry(
- __field(const char *, fn)
- __field(int, reg)
- __field(bool, is_write)
- __field(u64, write_value)
- ),
-
- TP_fast_assign(
- __entry->fn = fn;
- __entry->reg = reg;
- __entry->is_write = is_write;
- __entry->write_value = write_value;
- ),
-
- TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
-);
-
-TRACE_EVENT(kvm_handle_sys_reg,
- TP_PROTO(unsigned long hsr),
- TP_ARGS(hsr),
-
- TP_STRUCT__entry(
- __field(unsigned long, hsr)
- ),
-
- TP_fast_assign(
- __entry->hsr = hsr;
- ),
-
- TP_printk("HSR 0x%08lx", __entry->hsr)
-);
-
-TRACE_EVENT(kvm_sys_access,
- TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg),
- TP_ARGS(vcpu_pc, params, reg),
-
- TP_STRUCT__entry(
- __field(unsigned long, vcpu_pc)
- __field(bool, is_write)
- __field(const char *, name)
- __field(u8, Op0)
- __field(u8, Op1)
- __field(u8, CRn)
- __field(u8, CRm)
- __field(u8, Op2)
- ),
-
- TP_fast_assign(
- __entry->vcpu_pc = vcpu_pc;
- __entry->is_write = params->is_write;
- __entry->name = reg->name;
- __entry->Op0 = reg->Op0;
- __entry->Op0 = reg->Op0;
- __entry->Op1 = reg->Op1;
- __entry->CRn = reg->CRn;
- __entry->CRm = reg->CRm;
- __entry->Op2 = reg->Op2;
- ),
-
- TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s",
- __entry->vcpu_pc, __entry->name ?: "UNKN",
- __entry->Op0, __entry->Op1, __entry->CRn,
- __entry->CRm, __entry->Op2,
- __entry->is_write ? "write" : "read")
-);
-
-TRACE_EVENT(kvm_set_guest_debug,
- TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
- TP_ARGS(vcpu, guest_debug),
-
- TP_STRUCT__entry(
- __field(struct kvm_vcpu *, vcpu)
- __field(__u32, guest_debug)
- ),
-
- TP_fast_assign(
- __entry->vcpu = vcpu;
- __entry->guest_debug = guest_debug;
- ),
-
- TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
-);
-
-
-#endif /* _TRACE_ARM64_KVM_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
+#endif /* _TRACE_ARM64_KVM_H */
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
new file mode 100644
index 000000000000..4c71270cc097
--- /dev/null
+++ b/arch/arm64/kvm/trace_arm.h
@@ -0,0 +1,378 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ARM_ARM64_KVM_H
+
+#include <kvm/arm_arch_timer.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoints for entry/exit to guest
+ */
+TRACE_EVENT(kvm_entry,
+ TP_PROTO(unsigned long vcpu_pc),
+ TP_ARGS(vcpu_pc),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ ),
+
+ TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
+ TP_ARGS(ret, esr_ec, vcpu_pc),
+
+ TP_STRUCT__entry(
+ __field( int, ret )
+ __field( unsigned int, esr_ec )
+ __field( unsigned long, vcpu_pc )
+ ),
+
+ TP_fast_assign(
+ __entry->ret = ARM_EXCEPTION_CODE(ret);
+ __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
+ __entry->vcpu_pc = vcpu_pc;
+ ),
+
+ TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
+ __print_symbolic(__entry->ret, kvm_arm_exception_type),
+ __entry->esr_ec,
+ __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
+ __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_guest_fault,
+ TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
+ unsigned long hxfar,
+ unsigned long long ipa),
+ TP_ARGS(vcpu_pc, hsr, hxfar, ipa),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ __field( unsigned long, hsr )
+ __field( unsigned long, hxfar )
+ __field( unsigned long long, ipa )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->hsr = hsr;
+ __entry->hxfar = hxfar;
+ __entry->ipa = ipa;
+ ),
+
+ TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#08lx",
+ __entry->ipa, __entry->hsr,
+ __entry->hxfar, __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_access_fault,
+ TP_PROTO(unsigned long ipa),
+ TP_ARGS(ipa),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, ipa )
+ ),
+
+ TP_fast_assign(
+ __entry->ipa = ipa;
+ ),
+
+ TP_printk("IPA: %lx", __entry->ipa)
+);
+
+TRACE_EVENT(kvm_irq_line,
+ TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
+ TP_ARGS(type, vcpu_idx, irq_num, level),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, type )
+ __field( int, vcpu_idx )
+ __field( int, irq_num )
+ __field( int, level )
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ __entry->vcpu_idx = vcpu_idx;
+ __entry->irq_num = irq_num;
+ __entry->level = level;
+ ),
+
+ TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d",
+ (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" :
+ (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" :
+ (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN",
+ __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
+);
+
+TRACE_EVENT(kvm_mmio_emulate,
+ TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
+ unsigned long cpsr),
+ TP_ARGS(vcpu_pc, instr, cpsr),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ __field( unsigned long, instr )
+ __field( unsigned long, cpsr )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->instr = instr;
+ __entry->cpsr = cpsr;
+ ),
+
+ TP_printk("Emulate MMIO at: 0x%08lx (instr: %08lx, cpsr: %08lx)",
+ __entry->vcpu_pc, __entry->instr, __entry->cpsr)
+);
+
+TRACE_EVENT(kvm_unmap_hva_range,
+ TP_PROTO(unsigned long start, unsigned long end),
+ TP_ARGS(start, end),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, start )
+ __field( unsigned long, end )
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ ),
+
+ TP_printk("mmu notifier unmap range: %#08lx -- %#08lx",
+ __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_set_spte_hva,
+ TP_PROTO(unsigned long hva),
+ TP_ARGS(hva),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, hva )
+ ),
+
+ TP_fast_assign(
+ __entry->hva = hva;
+ ),
+
+ TP_printk("mmu notifier set pte hva: %#08lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_age_hva,
+ TP_PROTO(unsigned long start, unsigned long end),
+ TP_ARGS(start, end),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, start )
+ __field( unsigned long, end )
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ ),
+
+ TP_printk("mmu notifier age hva: %#08lx -- %#08lx",
+ __entry->start, __entry->end)
+);
+
+TRACE_EVENT(kvm_test_age_hva,
+ TP_PROTO(unsigned long hva),
+ TP_ARGS(hva),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, hva )
+ ),
+
+ TP_fast_assign(
+ __entry->hva = hva;
+ ),
+
+ TP_printk("mmu notifier test age hva: %#08lx", __entry->hva)
+);
+
+TRACE_EVENT(kvm_set_way_flush,
+ TP_PROTO(unsigned long vcpu_pc, bool cache),
+ TP_ARGS(vcpu_pc, cache),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ __field( bool, cache )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->cache = cache;
+ ),
+
+ TP_printk("S/W flush at 0x%016lx (cache %s)",
+ __entry->vcpu_pc, __entry->cache ? "on" : "off")
+);
+
+TRACE_EVENT(kvm_toggle_cache,
+ TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
+ TP_ARGS(vcpu_pc, was, now),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_pc )
+ __field( bool, was )
+ __field( bool, now )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->was = was;
+ __entry->now = now;
+ ),
+
+ TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
+ __entry->vcpu_pc, __entry->was ? "on" : "off",
+ __entry->now ? "on" : "off")
+);
+
+/*
+ * Tracepoints for arch_timer
+ */
+TRACE_EVENT(kvm_timer_update_irq,
+ TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
+ TP_ARGS(vcpu_id, irq, level),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_id )
+ __field( __u32, irq )
+ __field( int, level )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->irq = irq;
+ __entry->level = level;
+ ),
+
+ TP_printk("VCPU: %ld, IRQ %d, level %d",
+ __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+TRACE_EVENT(kvm_get_timer_map,
+ TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
+ TP_ARGS(vcpu_id, map),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_id )
+ __field( int, direct_vtimer )
+ __field( int, direct_ptimer )
+ __field( int, emul_ptimer )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer);
+ __entry->direct_ptimer =
+ (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
+ __entry->emul_ptimer =
+ (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
+ ),
+
+ TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
+ __entry->vcpu_id,
+ __entry->direct_vtimer,
+ __entry->direct_ptimer,
+ __entry->emul_ptimer)
+);
+
+TRACE_EVENT(kvm_timer_save_state,
+ TP_PROTO(struct arch_timer_context *ctx),
+ TP_ARGS(ctx),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, ctl )
+ __field( unsigned long long, cval )
+ __field( int, timer_idx )
+ ),
+
+ TP_fast_assign(
+ __entry->ctl = ctx->cnt_ctl;
+ __entry->cval = ctx->cnt_cval;
+ __entry->timer_idx = arch_timer_ctx_index(ctx);
+ ),
+
+ TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+ __entry->ctl,
+ __entry->cval,
+ __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_restore_state,
+ TP_PROTO(struct arch_timer_context *ctx),
+ TP_ARGS(ctx),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, ctl )
+ __field( unsigned long long, cval )
+ __field( int, timer_idx )
+ ),
+
+ TP_fast_assign(
+ __entry->ctl = ctx->cnt_ctl;
+ __entry->cval = ctx->cnt_cval;
+ __entry->timer_idx = arch_timer_ctx_index(ctx);
+ ),
+
+ TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+ __entry->ctl,
+ __entry->cval,
+ __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_hrtimer_expire,
+ TP_PROTO(struct arch_timer_context *ctx),
+ TP_ARGS(ctx),
+
+ TP_STRUCT__entry(
+ __field( int, timer_idx )
+ ),
+
+ TP_fast_assign(
+ __entry->timer_idx = arch_timer_ctx_index(ctx);
+ ),
+
+ TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_emulate,
+ TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
+ TP_ARGS(ctx, should_fire),
+
+ TP_STRUCT__entry(
+ __field( int, timer_idx )
+ __field( bool, should_fire )
+ ),
+
+ TP_fast_assign(
+ __entry->timer_idx = arch_timer_ctx_index(ctx);
+ __entry->should_fire = should_fire;
+ ),
+
+ TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
+ __entry->timer_idx, __entry->should_fire)
+);
+
+#endif /* _TRACE_ARM_ARM64_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_arm
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h
new file mode 100644
index 000000000000..2c56d1e0f5bd
--- /dev/null
+++ b/arch/arm64/kvm/trace_handle_exit.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_HANDLE_EXIT_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HANDLE_EXIT_ARM64_KVM_H
+
+#include <linux/tracepoint.h>
+#include "sys_regs.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+TRACE_EVENT(kvm_wfx_arm64,
+ TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
+ TP_ARGS(vcpu_pc, is_wfe),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, vcpu_pc)
+ __field(bool, is_wfe)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->is_wfe = is_wfe;
+ ),
+
+ TP_printk("guest executed wf%c at: 0x%08lx",
+ __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
+);
+
+TRACE_EVENT(kvm_hvc_arm64,
+ TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
+ TP_ARGS(vcpu_pc, r0, imm),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, vcpu_pc)
+ __field(unsigned long, r0)
+ __field(unsigned long, imm)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->r0 = r0;
+ __entry->imm = imm;
+ ),
+
+ TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx)",
+ __entry->vcpu_pc, __entry->r0, __entry->imm)
+);
+
+TRACE_EVENT(kvm_arm_setup_debug,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+ TP_ARGS(vcpu, guest_debug),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(__u32, guest_debug)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->guest_debug = guest_debug;
+ ),
+
+ TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_clear_debug,
+ TP_PROTO(__u32 guest_debug),
+ TP_ARGS(guest_debug),
+
+ TP_STRUCT__entry(
+ __field(__u32, guest_debug)
+ ),
+
+ TP_fast_assign(
+ __entry->guest_debug = guest_debug;
+ ),
+
+ TP_printk("flags: 0x%08x", __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_set_dreg32,
+ TP_PROTO(const char *name, __u32 value),
+ TP_ARGS(name, value),
+
+ TP_STRUCT__entry(
+ __field(const char *, name)
+ __field(__u32, value)
+ ),
+
+ TP_fast_assign(
+ __entry->name = name;
+ __entry->value = value;
+ ),
+
+ TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+);
+
+TRACE_DEFINE_SIZEOF(__u64);
+
+TRACE_EVENT(kvm_arm_set_regset,
+ TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
+ TP_ARGS(type, len, control, value),
+ TP_STRUCT__entry(
+ __field(const char *, name)
+ __field(int, len)
+ __array(u64, ctrls, 16)
+ __array(u64, values, 16)
+ ),
+ TP_fast_assign(
+ __entry->name = type;
+ __entry->len = len;
+ memcpy(__entry->ctrls, control, len << 3);
+ memcpy(__entry->values, value, len << 3);
+ ),
+ TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
+ __print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
+ __print_array(__entry->values, __entry->len, sizeof(__u64)))
+);
+
+TRACE_EVENT(trap_reg,
+ TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
+ TP_ARGS(fn, reg, is_write, write_value),
+
+ TP_STRUCT__entry(
+ __field(const char *, fn)
+ __field(int, reg)
+ __field(bool, is_write)
+ __field(u64, write_value)
+ ),
+
+ TP_fast_assign(
+ __entry->fn = fn;
+ __entry->reg = reg;
+ __entry->is_write = is_write;
+ __entry->write_value = write_value;
+ ),
+
+ TP_printk("%s %s reg %d (0x%08llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+);
+
+TRACE_EVENT(kvm_handle_sys_reg,
+ TP_PROTO(unsigned long hsr),
+ TP_ARGS(hsr),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, hsr)
+ ),
+
+ TP_fast_assign(
+ __entry->hsr = hsr;
+ ),
+
+ TP_printk("HSR 0x%08lx", __entry->hsr)
+);
+
+TRACE_EVENT(kvm_sys_access,
+ TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg),
+ TP_ARGS(vcpu_pc, params, reg),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, vcpu_pc)
+ __field(bool, is_write)
+ __field(const char *, name)
+ __field(u8, Op0)
+ __field(u8, Op1)
+ __field(u8, CRn)
+ __field(u8, CRm)
+ __field(u8, Op2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_pc = vcpu_pc;
+ __entry->is_write = params->is_write;
+ __entry->name = reg->name;
+ __entry->Op0 = reg->Op0;
+ __entry->Op1 = reg->Op1;
+ __entry->CRn = reg->CRn;
+ __entry->CRm = reg->CRm;
+ __entry->Op2 = reg->Op2;
+ ),
+
+ TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s",
+ __entry->vcpu_pc, __entry->name ?: "UNKN",
+ __entry->Op0, __entry->Op1, __entry->CRn,
+ __entry->CRm, __entry->Op2,
+ __entry->is_write ? "write" : "read")
+);
+
+TRACE_EVENT(kvm_set_guest_debug,
+ TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+ TP_ARGS(vcpu, guest_debug),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(__u32, guest_debug)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->guest_debug = guest_debug;
+ ),
+
+ TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+#endif /* _TRACE_HANDLE_EXIT_ARM64_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_handle_exit
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
index e7d1ea92095d..2f92bdcb1188 100644
--- a/arch/arm64/kvm/vgic-sys-reg-v3.c
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -7,7 +7,7 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
-#include "vgic.h"
+#include "vgic/vgic.h"
#include "sys_regs.h"
static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
diff --git a/arch/arm64/kvm/vgic/trace.h b/arch/arm64/kvm/vgic/trace.h
new file mode 100644
index 000000000000..83c64401a7fc
--- /dev/null
+++ b/arch/arm64/kvm/vgic/trace.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VGIC_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+TRACE_EVENT(vgic_update_irq_pending,
+ TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
+ TP_ARGS(vcpu_id, irq, level),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_id )
+ __field( __u32, irq )
+ __field( bool, level )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->irq = irq;
+ __entry->level = level;
+ ),
+
+ TP_printk("VCPU: %ld, IRQ %d, level: %d",
+ __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+#endif /* _TRACE_VGIC_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
new file mode 100644
index 000000000000..b13a9e3f99dd
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 Linaro
+ * Author: Christoffer Dall <christoffer.dall@linaro.org>
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/interrupt.h>
+#include <linux/kvm_host.h>
+#include <linux/seq_file.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/*
+ * Structure to control looping through the entire vgic state. We start at
+ * zero for each field and move upwards. So, if dist_id is 0 we print the
+ * distributor info. When dist_id is 1, we have already printed it and move
+ * on.
+ *
+ * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and
+ * so on.
+ */
+struct vgic_state_iter {
+ int nr_cpus;
+ int nr_spis;
+ int nr_lpis;
+ int dist_id;
+ int vcpu_id;
+ int intid;
+ int lpi_idx;
+ u32 *lpi_array;
+};
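+
+/*
+ * Example walk with two VCPUs (a sketch of the order iter_next() produces):
+ * dist_id 0 (the distributor banner), then the private intids 0..31 for
+ * VCPU 0 and VCPU 1 in turn, the shared SPIs starting at 32, and finally
+ * any LPIs collected in lpi_array.
+ */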
+
+static void iter_next(struct vgic_state_iter *iter)
+{
+ if (iter->dist_id == 0) {
+ iter->dist_id++;
+ return;
+ }
+
+ iter->intid++;
+ if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
+ ++iter->vcpu_id < iter->nr_cpus)
+ iter->intid = 0;
+
+ if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+ if (iter->lpi_idx < iter->nr_lpis)
+ iter->intid = iter->lpi_array[iter->lpi_idx];
+ iter->lpi_idx++;
+ }
+}
+
+static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
+ loff_t pos)
+{
+ int nr_cpus = atomic_read(&kvm->online_vcpus);
+
+ memset(iter, 0, sizeof(*iter));
+
+ iter->nr_cpus = nr_cpus;
+ iter->nr_spis = kvm->arch.vgic.nr_spis;
+ if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
+ if (iter->nr_lpis < 0)
+ iter->nr_lpis = 0;
+ }
+
+ /* Fast forward to the right position if needed */
+ while (pos--)
+ iter_next(iter);
+}
+
+static bool end_of_vgic(struct vgic_state_iter *iter)
+{
+ return iter->dist_id > 0 &&
+ iter->vcpu_id == iter->nr_cpus &&
+ iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
+ iter->lpi_idx > iter->nr_lpis;
+}
+
+static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
+{
+ struct kvm *kvm = (struct kvm *)s->private;
+ struct vgic_state_iter *iter;
+
+ mutex_lock(&kvm->lock);
+ iter = kvm->arch.vgic.iter;
+ if (iter) {
+ iter = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter) {
+ iter = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ iter_init(kvm, iter, *pos);
+ kvm->arch.vgic.iter = iter;
+
+ if (end_of_vgic(iter))
+ iter = NULL;
+out:
+ mutex_unlock(&kvm->lock);
+ return iter;
+}
+
+static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct kvm *kvm = (struct kvm *)s->private;
+ struct vgic_state_iter *iter = kvm->arch.vgic.iter;
+
+ ++*pos;
+ iter_next(iter);
+ if (end_of_vgic(iter))
+ iter = NULL;
+ return iter;
+}
+
+static void vgic_debug_stop(struct seq_file *s, void *v)
+{
+ struct kvm *kvm = (struct kvm *)s->private;
+ struct vgic_state_iter *iter;
+
+ /*
+ * If the seq file wasn't properly opened, there's nothing to clean
+ * up.
+ */
+ if (IS_ERR(v))
+ return;
+
+ mutex_lock(&kvm->lock);
+ iter = kvm->arch.vgic.iter;
+ kfree(iter->lpi_array);
+ kfree(iter);
+ kvm->arch.vgic.iter = NULL;
+ mutex_unlock(&kvm->lock);
+}
+
+static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
+{
+ bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
+
+ seq_printf(s, "Distributor\n");
+ seq_printf(s, "===========\n");
+ seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
+ seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
+ if (v3)
+ seq_printf(s, "nr_lpis:\t%d\n", dist->lpi_list_count);
+ seq_printf(s, "enabled:\t%d\n", dist->enabled);
+ seq_printf(s, "\n");
+
+ seq_printf(s, "P=pending_latch, L=line_level, A=active\n");
+ seq_printf(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n");
+ seq_printf(s, "G=group\n");
+}
+
+static void print_header(struct seq_file *s, struct vgic_irq *irq,
+ struct kvm_vcpu *vcpu)
+{
+ int id = 0;
+ char *hdr = "SPI ";
+
+ if (vcpu) {
+ hdr = "VCPU";
+ id = vcpu->vcpu_id;
+ }
+
+ seq_printf(s, "\n");
+ seq_printf(s, "%s%2d TYP ID TGT_ID PLAEHCG HWID TARGET SRC PRI VCPU_ID\n", hdr, id);
+ seq_printf(s, "----------------------------------------------------------------\n");
+}
+
+static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
+ struct kvm_vcpu *vcpu)
+{
+ char *type;
+ bool pending;
+
+ if (irq->intid < VGIC_NR_SGIS)
+ type = "SGI";
+ else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
+ type = "PPI";
+ else if (irq->intid < VGIC_MAX_SPI)
+ type = "SPI";
+ else
+ type = "LPI";
+
+ if (irq->intid == 0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
+ print_header(s, irq, vcpu);
+
+ pending = irq->pending_latch;
+ if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+ int err;
+
+ err = irq_get_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ &pending);
+ WARN_ON_ONCE(err);
+ }
+
+ seq_printf(s, " %s %4d "
+ " %2d "
+ "%d%d%d%d%d%d%d "
+ "%8d "
+ "%8x "
+ " %2x "
+ "%3d "
+ " %2d "
+ "\n",
+ type, irq->intid,
+ (irq->target_vcpu) ? irq->target_vcpu->vcpu_id : -1,
+ pending,
+ irq->line_level,
+ irq->active,
+ irq->enabled,
+ irq->hw,
+ irq->config == VGIC_CONFIG_LEVEL,
+ irq->group,
+ irq->hwintid,
+ irq->mpidr,
+ irq->source,
+ irq->priority,
+ (irq->vcpu) ? irq->vcpu->vcpu_id : -1);
+}
+
+static int vgic_debug_show(struct seq_file *s, void *v)
+{
+ struct kvm *kvm = (struct kvm *)s->private;
+ struct vgic_state_iter *iter = (struct vgic_state_iter *)v;
+ struct vgic_irq *irq;
+ struct kvm_vcpu *vcpu = NULL;
+ unsigned long flags;
+
+ if (iter->dist_id == 0) {
+ print_dist_state(s, &kvm->arch.vgic);
+ return 0;
+ }
+
+ if (!kvm->arch.vgic.initialized)
+ return 0;
+
+ if (iter->vcpu_id < iter->nr_cpus)
+ vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
+
+ irq = vgic_get_irq(kvm, vcpu, iter->intid);
+ if (!irq) {
+ seq_printf(s, " LPI %4d freed\n", iter->intid);
+ return 0;
+ }
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ print_irq_state(s, irq, vcpu);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(kvm, irq);
+ return 0;
+}
+
+static const struct seq_operations vgic_debug_seq_ops = {
+ .start = vgic_debug_start,
+ .next = vgic_debug_next,
+ .stop = vgic_debug_stop,
+ .show = vgic_debug_show
+};
+
+static int debug_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ ret = seq_open(file, &vgic_debug_seq_ops);
+ if (!ret) {
+ struct seq_file *seq;
+ /* seq_open will have modified file->private_data */
+ seq = file->private_data;
+ seq->private = inode->i_private;
+ }
+
+ return ret;
+}
+
+static const struct file_operations vgic_debug_fops = {
+ .owner = THIS_MODULE,
+ .open = debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+void vgic_debug_init(struct kvm *kvm)
+{
+ debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm,
+ &vgic_debug_fops);
+}
+
+void vgic_debug_destroy(struct kvm *kvm)
+{
+}
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
new file mode 100644
index 000000000000..32e32d67a127
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/uaccess.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
+#include "vgic.h"
+
+/*
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces. The basic
+ * idea is that even though the VGIC is not functional or not requested from
+ * user space, the critical path of the run loop can still call VGIC functions
+ * that just won't do anything, without them having to check additional
+ * initialization flags to ensure they don't look at uninitialized data
+ * structures.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ * depend on any sizing information or emulation type. No allocation
+ * is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ * structures that depend on sizing information (number of CPUs,
+ * number of interrupts). Also initializes the vcpu specific data
+ * structures. Can be executed lazily for GICv2.
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_vcpu_init(): initialization of static data that
+ * doesn't depend on any sizing information or emulation type. No
+ * allocation is allowed there.
+ */
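+
+/*
+ * Rough ordering (sketch): kvm_vgic_early_init() at VM creation,
+ * kvm_vgic_create() when userspace instantiates the device,
+ * kvm_vgic_vcpu_init() per VCPU, vgic_init() (lazily for GICv2, explicitly
+ * for GICv3), and finally kvm_vgic_map_resources() on the first VCPU run.
+ */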
+
+/* EARLY INIT */
+
+/**
+ * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures
+ * @kvm: The VM whose VGIC distributor should be initialized
+ *
+ * Only do initialization of static structures that don't require any
+ * allocation or sizing information from userspace. vgic_init() calls
+ * kvm_vgic_dist_init(), which takes care of the rest.
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+
+ INIT_LIST_HEAD(&dist->lpi_list_head);
+ INIT_LIST_HEAD(&dist->lpi_translation_cache);
+ raw_spin_lock_init(&dist->lpi_list_lock);
+}
+
+/* CREATION */
+
+/**
+ * kvm_vgic_create: triggered by the instantiation of the VGIC device by
+ * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
+ * or through the generic KVM_CREATE_DEVICE API ioctl.
+ * irqchip_in_kernel() tells you if this function succeeded or not.
+ * @kvm: kvm struct pointer
+ * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
+ */
+int kvm_vgic_create(struct kvm *kvm, u32 type)
+{
+ int i, ret;
+ struct kvm_vcpu *vcpu;
+
+ if (irqchip_in_kernel(kvm))
+ return -EEXIST;
+
+ /*
+ * This function is also called by the KVM_CREATE_IRQCHIP handler,
+ * which had no chance yet to check the availability of the GICv2
+ * emulation. So check this here again. KVM_CREATE_DEVICE does
+ * the proper checks already.
+ */
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+ !kvm_vgic_global_state.can_emulate_gicv2)
+ return -ENODEV;
+
+ ret = -EBUSY;
+ if (!lock_all_vcpus(kvm))
+ return ret;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->arch.has_run_once)
+ goto out_unlock;
+ }
+ ret = 0;
+
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+ kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
+ else
+ kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
+
+ if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
+ ret = -E2BIG;
+ goto out_unlock;
+ }
+
+ kvm->arch.vgic.in_kernel = true;
+ kvm->arch.vgic.vgic_model = type;
+
+ kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
+
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+ kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
+ else
+ INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+
+out_unlock:
+ unlock_all_vcpus(kvm);
+ return ret;
+}
+
+/* INIT/DESTROY */
+
+/**
+ * kvm_vgic_dist_init: initialize the dist data structures
+ * @kvm: kvm struct pointer
+ * @nr_spis: number of spis, frozen by caller
+ */
+static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
+ int i;
+
+ dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
+ if (!dist->spis)
+ return -ENOMEM;
+
+ /*
+ * In the following code we do not take the irq struct lock since
+ * no other action on irq structs can happen while the VGIC is
+ * not initialized yet:
+ * If someone wants to inject an interrupt or does a MMIO access, we
+ * require prior initialization in case of a virtual GICv3 or trigger
+ * initialization when using a virtual GICv2.
+ */
+ for (i = 0; i < nr_spis; i++) {
+ struct vgic_irq *irq = &dist->spis[i];
+
+ irq->intid = i + VGIC_NR_PRIVATE_IRQS;
+ INIT_LIST_HEAD(&irq->ap_list);
+ raw_spin_lock_init(&irq->irq_lock);
+ irq->vcpu = NULL;
+ irq->target_vcpu = vcpu0;
+ kref_init(&irq->refcount);
+ switch (dist->vgic_model) {
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ irq->targets = 0;
+ irq->group = 0;
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ irq->mpidr = 0;
+ irq->group = 1;
+ break;
+ default:
+ kfree(dist->spis);
+ dist->spis = NULL;
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+/**
+ * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
+ * structures and register VCPU-specific KVM iodevs
+ *
+ * @vcpu: pointer to the VCPU being created and initialized
+ *
+ * Only do initialization, but do not actually enable the
+ * VGIC CPU interface
+ */
+int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ int ret = 0;
+ int i;
+
+ vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+
+ INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+ raw_spin_lock_init(&vgic_cpu->ap_list_lock);
+ atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+
+ /*
+ * Enable and configure all SGIs to be edge-triggered and
+ * configure all PPIs as level-triggered.
+ */
+ for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+ struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+
+ INIT_LIST_HEAD(&irq->ap_list);
+ raw_spin_lock_init(&irq->irq_lock);
+ irq->intid = i;
+ irq->vcpu = NULL;
+ irq->target_vcpu = vcpu;
+ kref_init(&irq->refcount);
+ if (vgic_irq_is_sgi(i)) {
+ /* SGIs */
+ irq->enabled = 1;
+ irq->config = VGIC_CONFIG_EDGE;
+ } else {
+ /* PPIs */
+ irq->config = VGIC_CONFIG_LEVEL;
+ }
+ }
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return 0;
+
+ /*
+ * If we are creating a VCPU with a GICv3 we must also register the
+ * KVM io device for the redistributor that belongs to this VCPU.
+ */
+ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ mutex_lock(&vcpu->kvm->lock);
+ ret = vgic_register_redist_iodev(vcpu);
+ mutex_unlock(&vcpu->kvm->lock);
+ }
+ return ret;
+}
+
+static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_enable(vcpu);
+ else
+ vgic_v3_enable(vcpu);
+}
+
+/*
+ * vgic_init: allocates and initializes dist and vcpu data structures
+ * depending on two dimensioning parameters:
+ * - the number of spis
+ * - the number of vcpus
+ * The function is generally called when nr_spis has been explicitly set
+ * by the guest through the KVM DEVICE API. If not, nr_spis defaults to 256.
+ * vgic_initialized() returns true when this function has succeeded.
+ * Must be called with kvm->lock held!
+ */
+int vgic_init(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct kvm_vcpu *vcpu;
+ int ret = 0, i, idx;
+
+ if (vgic_initialized(kvm))
+ return 0;
+
+ /* Are we also in the middle of creating a VCPU? */
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
+ return -EBUSY;
+
+ /* freeze the number of spis */
+ if (!dist->nr_spis)
+ dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;
+
+ ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
+ if (ret)
+ goto out;
+
+ /* Initialize groups on CPUs created before the VGIC type was known */
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+ struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+ switch (dist->vgic_model) {
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ irq->group = 1;
+ irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ irq->group = 0;
+ irq->targets = 1U << idx;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ }
+
+ if (vgic_has_its(kvm))
+ vgic_lpi_translation_cache_init(kvm);
+
+ /*
+ * If we have GICv4.1 enabled, unconditionally request enabling the
+ * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
+ * enable it if we present a virtual ITS to the guest.
+ */
+ if (vgic_supports_direct_msis(kvm)) {
+ ret = vgic_v4_init(kvm);
+ if (ret)
+ goto out;
+ }
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vgic_vcpu_enable(vcpu);
+
+ ret = kvm_vgic_setup_default_irq_routing(kvm);
+ if (ret)
+ goto out;
+
+ vgic_debug_init(kvm);
+
+ dist->implementation_rev = 2;
+ dist->initialized = true;
+
+out:
+ return ret;
+}
+
+static void kvm_vgic_dist_destroy(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_redist_region *rdreg, *next;
+
+ dist->ready = false;
+ dist->initialized = false;
+
+ kfree(dist->spis);
+ dist->spis = NULL;
+ dist->nr_spis = 0;
+
+ if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) {
+ list_del(&rdreg->list);
+ kfree(rdreg);
+ }
+ INIT_LIST_HEAD(&dist->rd_regions);
+ }
+
+ if (vgic_has_its(kvm))
+ vgic_lpi_translation_cache_destroy(kvm);
+
+ if (vgic_supports_direct_msis(kvm))
+ vgic_v4_teardown(kvm);
+}
+
+void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ /*
+ * Retire all pending LPIs on this vcpu anyway as we're
+ * going to destroy it.
+ */
+ vgic_flush_pending_lpis(vcpu);
+
+ INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+}
+
+/* To be called with kvm->lock held */
+static void __kvm_vgic_destroy(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ vgic_debug_destroy(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vgic_vcpu_destroy(vcpu);
+
+ kvm_vgic_dist_destroy(kvm);
+}
+
+void kvm_vgic_destroy(struct kvm *kvm)
+{
+ mutex_lock(&kvm->lock);
+ __kvm_vgic_destroy(kvm);
+ mutex_unlock(&kvm->lock);
+}
+
+/**
+ * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
+ * is a GICv2. A GICv3 must be explicitly initialized by the guest using the
+ * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
+ * @kvm: kvm struct pointer
+ */
+int vgic_lazy_init(struct kvm *kvm)
+{
+ int ret = 0;
+
+ if (unlikely(!vgic_initialized(kvm))) {
+ /*
+ * We only provide the automatic initialization of the VGIC
+ * for the legacy case of a GICv2. Any other type must
+ * be explicitly initialized once setup with the respective
+ * KVM device call.
+ */
+ if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+ return -EBUSY;
+
+ mutex_lock(&kvm->lock);
+ ret = vgic_init(kvm);
+ mutex_unlock(&kvm->lock);
+ }
+
+ return ret;
+}
+
+/* RESOURCE MAPPING */
+
+/**
+ * Map the MMIO regions depending on the VGIC model exposed to the guest;
+ * called on the first VCPU run.
+ * Also map the virtual CPU interface into the VM.
+ * v2/v3 derivatives call vgic_init if not already done.
+ * vgic_ready() returns true if this function has succeeded.
+ * @kvm: kvm struct pointer
+ */
+int kvm_vgic_map_resources(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ int ret = 0;
+
+ mutex_lock(&kvm->lock);
+ if (!irqchip_in_kernel(kvm))
+ goto out;
+
+ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
+ ret = vgic_v2_map_resources(kvm);
+ else
+ ret = vgic_v3_map_resources(kvm);
+
+ if (ret)
+ __kvm_vgic_destroy(kvm);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+/* GENERIC PROBE */
+
+static int vgic_init_cpu_starting(unsigned int cpu)
+{
+ enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
+ return 0;
+}
+
+static int vgic_init_cpu_dying(unsigned int cpu)
+{
+ disable_percpu_irq(kvm_vgic_global_state.maint_irq);
+ return 0;
+}
+
+static irqreturn_t vgic_maintenance_handler(int irq, void *data)
+{
+ /*
+ * We cannot rely on the vgic maintenance interrupt to be
+ * delivered synchronously. This means we can only use it to
+ * exit the VM, and we perform the handling of EOIed
+ * interrupts on the exit path (see vgic_fold_lr_state).
+ */
+ return IRQ_HANDLED;
+}
+
+/**
+ * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
+ *
+ * For a specific CPU, initialize the GIC VE hardware.
+ */
+void kvm_vgic_init_cpu_hardware(void)
+{
+ BUG_ON(preemptible());
+
+ /*
+ * We want to make sure the list registers start out clear so that we
+ * only have to program the used registers.
+ */
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_init_lrs();
+ else
+ kvm_call_hyp(__vgic_v3_init_lrs);
+}
+
+/**
+ * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable
+ * according to the host GIC model. Accordingly calls either
+ * vgic_v2/v3_probe which registers the KVM_DEVICE that can be
+ * instantiated by a guest later on.
+ */
+int kvm_vgic_hyp_init(void)
+{
+ const struct gic_kvm_info *gic_kvm_info;
+ int ret;
+
+ gic_kvm_info = gic_get_kvm_info();
+ if (!gic_kvm_info)
+ return -ENODEV;
+
+ if (!gic_kvm_info->maint_irq) {
+ kvm_err("No vgic maintenance irq\n");
+ return -ENXIO;
+ }
+
+ switch (gic_kvm_info->type) {
+ case GIC_V2:
+ ret = vgic_v2_probe(gic_kvm_info);
+ break;
+ case GIC_V3:
+ ret = vgic_v3_probe(gic_kvm_info);
+ if (!ret) {
+ static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
+ kvm_info("GIC system register CPU interface enabled\n");
+ }
+ break;
+ default:
+ ret = -ENODEV;
+ }
+
+ if (ret)
+ return ret;
+
+ kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;
+ ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
+ vgic_maintenance_handler,
+ "vgic", kvm_get_running_vcpus());
+ if (ret) {
+ kvm_err("Cannot register interrupt %d\n",
+ kvm_vgic_global_state.maint_irq);
+ return ret;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
+ "kvm/arm/vgic:starting",
+ vgic_init_cpu_starting, vgic_init_cpu_dying);
+ if (ret) {
+ kvm_err("Cannot register vgic CPU notifier\n");
+ goto out_free_irq;
+ }
+
+ kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
+ return 0;
+
+out_free_irq:
+ free_percpu_irq(kvm_vgic_global_state.maint_irq,
+ kvm_get_running_vcpus());
+ return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c
new file mode 100644
index 000000000000..d8cdfea5cc96
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-irqfd.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <trace/events/kvm.h>
+#include <kvm/arm_vgic.h>
+#include "vgic.h"
+
+/**
+ * vgic_irqfd_set_irq: inject the IRQ corresponding to the
+ * irqchip routing entry
+ *
+ * This is the entry point for irqfd IRQ injection
+ */
+static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id,
+ int level, bool line_status)
+{
+ unsigned int spi_id = e->irqchip.pin + VGIC_NR_PRIVATE_IRQS;
+
+ if (!vgic_valid_spi(kvm, spi_id))
+ return -EINVAL;
+ return kvm_vgic_inject_irq(kvm, 0, spi_id, level, NULL);
+}
+
+/**
+ * kvm_set_routing_entry: populate a kvm routing entry
+ * from a user routing entry
+ *
+ * @kvm: the VM this entry is applied to
+ * @e: kvm kernel routing entry handle
+ * @ue: user api routing entry handle
+ * return 0 on success, -EINVAL on errors.
+ */
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ e->set = vgic_irqfd_set_irq;
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ e->irqchip.pin = ue->u.irqchip.pin;
+ if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) ||
+ (e->irqchip.irqchip >= KVM_NR_IRQCHIPS))
+ goto out;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+ e->msi.flags = ue->flags;
+ e->msi.devid = ue->u.msi.devid;
+ break;
+ default:
+ goto out;
+ }
+ r = 0;
+out:
+ return r;
+}
+
+static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_msi *msi)
+{
+ msi->address_lo = e->msi.address_lo;
+ msi->address_hi = e->msi.address_hi;
+ msi->data = e->msi.data;
+ msi->flags = e->msi.flags;
+ msi->devid = e->msi.devid;
+}
+
+/**
+ * kvm_set_msi: inject the MSI corresponding to the
+ * MSI routing entry
+ *
+ * This is the entry point for irqfd MSI injection
+ * and userspace MSI injection.
+ */
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id,
+ int level, bool line_status)
+{
+ struct kvm_msi msi;
+
+ if (!vgic_has_its(kvm))
+ return -ENODEV;
+
+ if (!level)
+ return -1;
+
+ kvm_populate_msi(e, &msi);
+ return vgic_its_inject_msi(kvm, &msi);
+}
+
+/**
+ * kvm_arch_set_irq_inatomic: fast-path for irqfd injection
+ *
+ * Currently only direct MSI injection is supported.
+ */
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ if (e->type == KVM_IRQ_ROUTING_MSI && vgic_has_its(kvm) && level) {
+ struct kvm_msi msi;
+
+ kvm_populate_msi(e, &msi);
+ if (!vgic_its_inject_cached_translation(kvm, &msi))
+ return 0;
+ }
+
+ return -EWOULDBLOCK;
+}
+
+int kvm_vgic_setup_default_irq_routing(struct kvm *kvm)
+{
+ struct kvm_irq_routing_entry *entries;
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ u32 nr = dist->nr_spis;
+ int i, ret;
+
+ entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ for (i = 0; i < nr; i++) {
+ entries[i].gsi = i;
+ entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
+ entries[i].u.irqchip.irqchip = 0;
+ entries[i].u.irqchip.pin = i;
+ }
+ ret = kvm_set_irq_routing(kvm, entries, nr, 0);
+ kfree(entries);
+ return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
new file mode 100644
index 000000000000..c012a52b19f5
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -0,0 +1,2783 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015,2016 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+#include <linux/list_sort.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+static int vgic_its_save_tables_v0(struct vgic_its *its);
+static int vgic_its_restore_tables_v0(struct vgic_its *its);
+static int vgic_its_commit_v0(struct vgic_its *its);
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+ struct kvm_vcpu *filter_vcpu, bool needs_inv);
+
+/*
+ * Creates a new (reference to a) struct vgic_irq for a given LPI.
+ * If this LPI is already mapped on another ITS, we increase its refcount
+ * and return a pointer to the existing structure.
+ * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
+ * This function returns a pointer to the _unlocked_ structure.
+ */
+static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
+ struct kvm_vcpu *vcpu)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+ unsigned long flags;
+ int ret;
+
+ /* In this case there is no put, since we keep the reference. */
+ if (irq)
+ return irq;
+
+ irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
+ if (!irq)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&irq->lpi_list);
+ INIT_LIST_HEAD(&irq->ap_list);
+ raw_spin_lock_init(&irq->irq_lock);
+
+ irq->config = VGIC_CONFIG_EDGE;
+ kref_init(&irq->refcount);
+ irq->intid = intid;
+ irq->target_vcpu = vcpu;
+ irq->group = 1;
+
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+ /*
+ * There could be a race with another vgic_add_lpi(), so we need to
+ * check that we don't add a second list entry with the same LPI.
+ */
+ list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
+ if (oldirq->intid != intid)
+ continue;
+
+ /* Someone was faster with adding this LPI, let's use that. */
+ kfree(irq);
+ irq = oldirq;
+
+ /*
+ * This increases the refcount, the caller is expected to
+ * call vgic_put_irq() on the returned pointer once it's
+ * finished with the IRQ.
+ */
+ vgic_get_irq_kref(irq);
+
+ goto out_unlock;
+ }
+
+ list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
+ dist->lpi_list_count++;
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+ /*
+ * We "cache" the configuration table entries in our struct vgic_irq's.
+ * However we only have those structs for mapped IRQs, so we read in
+ * the respective config data from memory here upon mapping the LPI.
+ *
+ * Should any of these fail, behave as if we couldn't create the LPI
+ * by dropping the refcount and returning the error.
+ */
+ ret = update_lpi_config(kvm, irq, NULL, false);
+ if (ret) {
+ vgic_put_irq(kvm, irq);
+ return ERR_PTR(ret);
+ }
+
+ ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
+ if (ret) {
+ vgic_put_irq(kvm, irq);
+ return ERR_PTR(ret);
+ }
+
+ return irq;
+}
+
+struct its_device {
+ struct list_head dev_list;
+
+ /* the head for the list of ITTEs */
+ struct list_head itt_head;
+ u32 num_eventid_bits;
+ gpa_t itt_addr;
+ u32 device_id;
+};
+
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+struct its_collection {
+ struct list_head coll_list;
+
+ u32 collection_id;
+ u32 target_addr;
+};
+
+#define its_is_collection_mapped(coll) ((coll) && \
+ ((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+struct its_ite {
+ struct list_head ite_list;
+
+ struct vgic_irq *irq;
+ struct its_collection *collection;
+ u32 event_id;
+};
+
+struct vgic_translation_cache_entry {
+ struct list_head entry;
+ phys_addr_t db;
+ u32 devid;
+ u32 eventid;
+ struct vgic_irq *irq;
+};
+
+/**
+ * struct vgic_its_abi - ITS abi ops and settings
+ * @cte_esz: collection table entry size
+ * @dte_esz: device table entry size
+ * @ite_esz: interrupt translation table entry size
+ * @save_tables: save the ITS tables into guest RAM
+ * @restore_tables: restore the ITS internal structs from tables
+ * stored in guest RAM
+ * @commit: initialize the registers which expose the ABI settings,
+ * especially the entry sizes
+ */
+struct vgic_its_abi {
+ int cte_esz;
+ int dte_esz;
+ int ite_esz;
+ int (*save_tables)(struct vgic_its *its);
+ int (*restore_tables)(struct vgic_its *its);
+ int (*commit)(struct vgic_its *its);
+};
+
+#define ABI_0_ESZ 8
+#define ESZ_MAX ABI_0_ESZ
+
+static const struct vgic_its_abi its_table_abi_versions[] = {
+ [0] = {
+ .cte_esz = ABI_0_ESZ,
+ .dte_esz = ABI_0_ESZ,
+ .ite_esz = ABI_0_ESZ,
+ .save_tables = vgic_its_save_tables_v0,
+ .restore_tables = vgic_its_restore_tables_v0,
+ .commit = vgic_its_commit_v0,
+ },
+};
+
+#define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions)
+
+inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its)
+{
+ return &its_table_abi_versions[its->abi_rev];
+}
+
+static int vgic_its_set_abi(struct vgic_its *its, u32 rev)
+{
+ const struct vgic_its_abi *abi;
+
+ its->abi_rev = rev;
+ abi = vgic_its_get_abi(its);
+ return abi->commit(its);
+}
+
+/*
+ * Find and returns a device in the device table for an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
+{
+ struct its_device *device;
+
+ list_for_each_entry(device, &its->device_list, dev_list)
+ if (device_id == device->device_id)
+ return device;
+
+ return NULL;
+}
+
+/*
+ * Find and returns an interrupt translation table entry (ITTE) for a given
+ * Device ID/Event ID pair on an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
+ u32 event_id)
+{
+ struct its_device *device;
+ struct its_ite *ite;
+
+ device = find_its_device(its, device_id);
+ if (device == NULL)
+ return NULL;
+
+ list_for_each_entry(ite, &device->itt_head, ite_list)
+ if (ite->event_id == event_id)
+ return ite;
+
+ return NULL;
+}
+
+/* To be used as an iterator, this macro omits the enclosing parentheses */
+#define for_each_lpi_its(dev, ite, its) \
+ list_for_each_entry(dev, &(its)->device_list, dev_list) \
+ list_for_each_entry(ite, &(dev)->itt_head, ite_list)
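+
+/*
+ * Typical use (see e.g. update_affinity_collection() below), walking every
+ * ITTE of every device on this ITS:
+ *
+ *	for_each_lpi_its(device, ite, its)
+ *		update_affinity_ite(kvm, ite);
+ */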
+
+#define GIC_LPI_OFFSET 8192
+
+#define VITS_TYPER_IDBITS 16
+#define VITS_TYPER_DEVBITS 16
+#define VITS_DTE_MAX_DEVID_OFFSET (BIT(14) - 1)
+#define VITS_ITE_MAX_EVENTID_OFFSET (BIT(16) - 1)
+
+/*
+ * Finds and returns a collection in the ITS collection table.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
+{
+ struct its_collection *collection;
+
+ list_for_each_entry(collection, &its->collection_list, coll_list) {
+ if (coll_id == collection->collection_id)
+ return collection;
+ }
+
+ return NULL;
+}
+
+#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p) ((p) & 0xfc)
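+
+/*
+ * Each LPI configuration table entry is a single byte: bit 0 is the enable
+ * bit and bits [7:2] hold the priority, which is what the two masks above
+ * extract.
+ */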
+
+/*
+ * Reads the configuration data for a given LPI from guest memory and
+ * updates the fields in struct vgic_irq.
+ * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
+ * VCPU. Unconditionally applies if filter_vcpu is NULL.
+ */
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+ struct kvm_vcpu *filter_vcpu, bool needs_inv)
+{
+ u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
+ u8 prop;
+ int ret;
+ unsigned long flags;
+
+ ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
+ &prop, 1);
+
+ if (ret)
+ return ret;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
+ irq->priority = LPI_PROP_PRIORITY(prop);
+ irq->enabled = LPI_PROP_ENABLE_BIT(prop);
+
+ if (!irq->hw) {
+ vgic_queue_irq_unlock(kvm, irq, flags);
+ return 0;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ if (irq->hw)
+ return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
+
+ return 0;
+}
+
+/*
+ * Create a snapshot of the current LPIs targeting @vcpu, so that we can
+ * enumerate those LPIs without holding any lock.
+ * Returns their number and puts the kmalloc'ed array into intid_ptr.
+ */
+int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ unsigned long flags;
+ u32 *intids;
+ int irq_count, i = 0;
+
+ /*
+ * There is an obvious race between allocating the array and LPIs
+ * being mapped/unmapped. If we ended up here as a result of a
+ * command, we're safe (locks are held, preventing another
+ * command). If coming from another path (such as enabling LPIs),
+ * we must be careful not to overrun the array.
+ */
+ irq_count = READ_ONCE(dist->lpi_list_count);
+ intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
+ if (!intids)
+ return -ENOMEM;
+
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ if (i == irq_count)
+ break;
+ /* We don't need to "get" the IRQ, as we hold the list lock. */
+ if (vcpu && irq->target_vcpu != vcpu)
+ continue;
+ intids[i++] = irq->intid;
+ }
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+ *intid_ptr = intids;
+ return i;
+}
+
+static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
+{
+ int ret = 0;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->target_vcpu = vcpu;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ if (irq->hw) {
+ struct its_vlpi_map map;
+
+ ret = its_get_vlpi(irq->host_irq, &map);
+ if (ret)
+ return ret;
+
+ if (map.vpe)
+ atomic_dec(&map.vpe->vlpi_count);
+ map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+ atomic_inc(&map.vpe->vlpi_count);
+
+ ret = its_map_vlpi(irq->host_irq, &map);
+ }
+
+ return ret;
+}
+
+/*
+ * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
+ * is targeting) to the VGIC's view, which deals with target VCPUs.
+ * Needs to be called whenever either the collection for an LPI has
+ * changed or the collection itself got retargeted.
+ */
+static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!its_is_collection_mapped(ite->collection))
+ return;
+
+ vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
+ update_affinity(ite->irq, vcpu);
+}
+
+/*
+ * Updates the target VCPU for every LPI targeting this collection.
+ * Must be called with the its_lock mutex held.
+ */
+static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
+ struct its_collection *coll)
+{
+ struct its_device *device;
+ struct its_ite *ite;
+
+ for_each_lpi_its(device, ite, its) {
+ if (!ite->collection || coll != ite->collection)
+ continue;
+
+ update_affinity_ite(kvm, ite);
+ }
+}
+
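+/*
+ * Example: a PROPBASER IDbits field of 15 encodes 16 interrupt ID bits,
+ * i.e. up to 65536 interrupt IDs, subject to the INTERRUPT_ID_BITS_ITS
+ * clamp below.
+ */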
+static u32 max_lpis_propbaser(u64 propbaser)
+{
+ int nr_idbits = (propbaser & 0x1f) + 1;
+
+ return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
+}
+
+/*
+ * Sync the pending table pending bit of LPIs targeting @vcpu
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ */
+static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+ gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+ struct vgic_irq *irq;
+ int last_byte_offset = -1;
+ int ret = 0;
+ u32 *intids;
+ int nr_irqs, i;
+ unsigned long flags;
+ u8 pendmask;
+
+ nr_irqs = vgic_copy_lpi_list(vcpu->kvm, vcpu, &intids);
+ if (nr_irqs < 0)
+ return nr_irqs;
+
+ for (i = 0; i < nr_irqs; i++) {
+ int byte_offset, bit_nr;
+
+ byte_offset = intids[i] / BITS_PER_BYTE;
+ bit_nr = intids[i] % BITS_PER_BYTE;
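+ /* e.g. intid 8192 (the first LPI) maps to byte 1024, bit 0 */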
+
+ /*
+ * For contiguously allocated LPIs chances are we just read
+ * this very same byte in the last iteration. Reuse that.
+ */
+ if (byte_offset != last_byte_offset) {
+ ret = kvm_read_guest_lock(vcpu->kvm,
+ pendbase + byte_offset,
+ &pendmask, 1);
+ if (ret) {
+ kfree(intids);
+ return ret;
+ }
+ last_byte_offset = byte_offset;
+ }
+
+ irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->pending_latch = pendmask & (1U << bit_nr);
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ kfree(intids);
+
+ return ret;
+}
+
+static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 reg = GITS_TYPER_PLPIS;
+
+ /*
+ * We use linear CPU numbers for redistributor addressing,
+ * so GITS_TYPER.PTA is 0.
+ * Also we force all PROPBASER registers to be the same, so
+ * CommonLPIAff is 0 as well.
+ * To avoid memory waste in the guest, we keep the number of IDBits and
+ * DevBits low - at least for the time being.
+ */
+ reg |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT;
+ reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT;
+ reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT;
+
+ return extract_bytes(reg, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u32 val;
+
+ val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK;
+ val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM;
+ return val;
+}
+
+static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 rev = GITS_IIDR_REV(val);
+
+ if (rev >= NR_ITS_ABIS)
+ return -EINVAL;
+ return vgic_its_set_abi(its, rev);
+}
+
+static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ switch (addr & 0xffff) {
+ case GITS_PIDR0:
+ return 0x92; /* part number, bits[7:0] */
+ case GITS_PIDR1:
+ return 0xb4; /* part number, bits[11:8] */
+ case GITS_PIDR2:
+ return GIC_PIDR2_ARCH_GICv3 | 0x0b;
+ case GITS_PIDR4:
+ return 0x40; /* This is a 64K software visible page */
+ /* The following are the ID registers for (any) GIC. */
+ case GITS_CIDR0:
+ return 0x0d;
+ case GITS_CIDR1:
+ return 0xf0;
+ case GITS_CIDR2:
+ return 0x05;
+ case GITS_CIDR3:
+ return 0xb1;
+ }
+
+ return 0;
+}
+
+static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
+ phys_addr_t db,
+ u32 devid, u32 eventid)
+{
+ struct vgic_translation_cache_entry *cte;
+
+ list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+ /*
+ * If we hit a NULL entry, there is nothing after this
+ * point.
+ */
+ if (!cte->irq)
+ break;
+
+ if (cte->db != db || cte->devid != devid ||
+ cte->eventid != eventid)
+ continue;
+
+ /*
+ * Move this entry to the head, as it is the most
+ * recently used.
+ */
+ if (!list_is_first(&cte->entry, &dist->lpi_translation_cache))
+ list_move(&cte->entry, &dist->lpi_translation_cache);
+
+ return cte->irq;
+ }
+
+ return NULL;
+}
+
+static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
+ u32 devid, u32 eventid)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ irq = __vgic_its_check_cache(dist, db, devid, eventid);
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+ return irq;
+}
+
+static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
+ u32 devid, u32 eventid,
+ struct vgic_irq *irq)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_translation_cache_entry *cte;
+ unsigned long flags;
+ phys_addr_t db;
+
+ /* Do not cache a directly injected interrupt */
+ if (irq->hw)
+ return;
+
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+ if (unlikely(list_empty(&dist->lpi_translation_cache)))
+ goto out;
+
+ /*
+ * We could have raced with another CPU caching the same
+ * translation behind our back, so let's check that it is not
+ * already in the cache.
+ */
+ db = its->vgic_its_base + GITS_TRANSLATER;
+ if (__vgic_its_check_cache(dist, db, devid, eventid))
+ goto out;
+
+ /* Always reuse the last entry (LRU policy) */
+ cte = list_last_entry(&dist->lpi_translation_cache,
+ typeof(*cte), entry);
+
+ /*
+ * Caching the translation implies having an extra reference
+ * to the interrupt, so drop the potential reference on what
+ * was in the cache, and increment it on the new interrupt.
+ */
+ if (cte->irq)
+ __vgic_put_lpi_locked(kvm, cte->irq);
+
+ vgic_get_irq_kref(irq);
+
+ cte->db = db;
+ cte->devid = devid;
+ cte->eventid = eventid;
+ cte->irq = irq;
+
+ /* Move the new translation to the head of the list */
+ list_move(&cte->entry, &dist->lpi_translation_cache);
+
+out:
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+void vgic_its_invalidate_cache(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_translation_cache_entry *cte;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+ list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
+ /*
+ * If we hit a NULL entry, there is nothing after this
+ * point.
+ */
+ if (!cte->irq)
+ break;
+
+ __vgic_put_lpi_locked(kvm, cte->irq);
+ cte->irq = NULL;
+ }
+
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
+ u32 devid, u32 eventid, struct vgic_irq **irq)
+{
+ struct kvm_vcpu *vcpu;
+ struct its_ite *ite;
+
+ if (!its->enabled)
+ return -EBUSY;
+
+ ite = find_ite(its, devid, eventid);
+ if (!ite || !its_is_collection_mapped(ite->collection))
+ return E_ITS_INT_UNMAPPED_INTERRUPT;
+
+ vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr);
+ if (!vcpu)
+ return E_ITS_INT_UNMAPPED_INTERRUPT;
+
+ if (!vcpu->arch.vgic_cpu.lpis_enabled)
+ return -EBUSY;
+
+ vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
+
+ *irq = ite->irq;
+ return 0;
+}
+
+struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
+{
+ u64 address;
+ struct kvm_io_device *kvm_io_dev;
+ struct vgic_io_device *iodev;
+
+ if (!vgic_has_its(kvm))
+ return ERR_PTR(-ENODEV);
+
+ if (!(msi->flags & KVM_MSI_VALID_DEVID))
+ return ERR_PTR(-EINVAL);
+
+ address = (u64)msi->address_hi << 32 | msi->address_lo;
+
+ kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
+ if (!kvm_io_dev)
+ return ERR_PTR(-EINVAL);
+
+ if (kvm_io_dev->ops != &kvm_io_gic_ops)
+ return ERR_PTR(-EINVAL);
+
+ iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+ if (iodev->iodev_type != IODEV_ITS)
+ return ERR_PTR(-EINVAL);
+
+ return iodev->its;
+}
+
+/*
+ * Find the target VCPU and the LPI number for a given devid/eventid pair
+ * and make this IRQ pending, possibly injecting it.
+ * Must be called with the its_lock mutex held.
+ * Returns 0 on success, a positive error value for any ITS mapping
+ * related errors and negative error values for generic errors.
+ */
+static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
+ u32 devid, u32 eventid)
+{
+ struct vgic_irq *irq = NULL;
+ unsigned long flags;
+ int err;
+
+ err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq);
+ if (err)
+ return err;
+
+ if (irq->hw)
+ return irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING, true);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->pending_latch = true;
+ vgic_queue_irq_unlock(kvm, irq, flags);
+
+ return 0;
+}
+
+int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
+{
+ struct vgic_irq *irq;
+ unsigned long flags;
+ phys_addr_t db;
+
+ db = (u64)msi->address_hi << 32 | msi->address_lo;
+ irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
+
+ if (!irq)
+ return -1;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->pending_latch = true;
+ vgic_queue_irq_unlock(kvm, irq, flags);
+
+ return 0;
+}
+
+/*
+ * Queries the KVM IO bus framework to get the ITS pointer from the given
+ * doorbell address.
+ * We then call vgic_its_trigger_msi() with the decoded data.
+ * According to the KVM_SIGNAL_MSI API description, this returns 1 on success.
+ */
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ struct vgic_its *its;
+ int ret;
+
+ if (!vgic_its_inject_cached_translation(kvm, msi))
+ return 1;
+
+ its = vgic_msi_to_its(kvm, msi);
+ if (IS_ERR(its))
+ return PTR_ERR(its);
+
+ mutex_lock(&its->its_lock);
+ ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data);
+ mutex_unlock(&its->its_lock);
+
+ if (ret < 0)
+ return ret;
+
+ /*
+ * KVM_SIGNAL_MSI demands a return value > 0 for success and 0
+ * if the guest has blocked the MSI. So we map any LPI mapping
+ * related error to that.
+ */
+ if (ret)
+ return 0;
+ else
+ return 1;
+}
+
+/* Requires the its_lock to be held. */
+static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
+{
+ list_del(&ite->ite_list);
+
+ /* This put matches the get in vgic_add_lpi. */
+ if (ite->irq) {
+ if (ite->irq->hw)
+ WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+
+ vgic_put_irq(kvm, ite->irq);
+ }
+
+ kfree(ite);
+}
+
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+ return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
+}
+
+#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8)
+#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1)
+#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32)
+#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16)
+#define its_cmd_get_ittaddr(cmd) (its_cmd_mask_field(cmd, 2, 8, 44) << 8)
+#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1)
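+
+/*
+ * Example: its_cmd_get_deviceid() extracts bits [63:32] of doubleword 0 of
+ * a command, and its_cmd_get_id() bits [31:0] of doubleword 1, following
+ * the GITS command queue field layout.
+ */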
+
+/*
+ * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_ite *ite;
+
+ ite = find_ite(its, device_id, event_id);
+ if (ite && its_is_collection_mapped(ite->collection)) {
+ /*
+ * Though the spec talks about removing the pending state, we
+ * don't bother here since we clear the ITTE anyway and the
+ * pending state is a property of the ITTE struct.
+ */
+ vgic_its_invalidate_cache(kvm);
+
+ its_free_ite(kvm, ite);
+ return 0;
+ }
+
+ return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+}
+
+/*
+ * The MOVI command moves an ITTE to a different collection.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct kvm_vcpu *vcpu;
+ struct its_ite *ite;
+ struct its_collection *collection;
+
+ ite = find_ite(its, device_id, event_id);
+ if (!ite)
+ return E_ITS_MOVI_UNMAPPED_INTERRUPT;
+
+ if (!its_is_collection_mapped(ite->collection))
+ return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+ collection = find_collection(its, coll_id);
+ if (!its_is_collection_mapped(collection))
+ return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+ ite->collection = collection;
+ vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+ vgic_its_invalidate_cache(kvm);
+
+ return update_affinity(ite->irq, vcpu);
+}
+
+/*
+ * Check whether an ID can be stored into the corresponding guest table.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessible).
+ * For this we have to read the respective first level entry.
+ */
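+/*
+ * Worked example with 8-byte entries: each 64K second-level page holds
+ * 8192 entries, so for an indirect table ID 10000 uses first-level index 1
+ * (10000 / 8192) and slot 1808 (10000 % 8192) in the page that entry
+ * points to.
+ */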
+static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
+ gpa_t *eaddr)
+{
+ int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+ u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
+ phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
+ int esz = GITS_BASER_ENTRY_SIZE(baser);
+ int index, idx;
+ gfn_t gfn;
+ bool ret;
+
+ switch (type) {
+ case GITS_BASER_TYPE_DEVICE:
+ if (id >= BIT_ULL(VITS_TYPER_DEVBITS))
+ return false;
+ break;
+ case GITS_BASER_TYPE_COLLECTION:
+ /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */
+ if (id >= BIT_ULL(16))
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ if (!(baser & GITS_BASER_INDIRECT)) {
+ phys_addr_t addr;
+
+ if (id >= (l1_tbl_size / esz))
+ return false;
+
+ addr = base + id * esz;
+ gfn = addr >> PAGE_SHIFT;
+
+ if (eaddr)
+ *eaddr = addr;
+
+ goto out;
+ }
+
+ /* calculate and check the index into the 1st level */
+ index = id / (SZ_64K / esz);
+ if (index >= (l1_tbl_size / sizeof(u64)))
+ return false;
+
+ /* Each 1st level entry is represented by a 64-bit value. */
+ if (kvm_read_guest_lock(its->dev->kvm,
+ base + index * sizeof(indirect_ptr),
+ &indirect_ptr, sizeof(indirect_ptr)))
+ return false;
+
+ indirect_ptr = le64_to_cpu(indirect_ptr);
+
+ /* check the valid bit of the first level entry */
+ if (!(indirect_ptr & BIT_ULL(63)))
+ return false;
+
+ /* Mask the guest physical address and calculate the frame number. */
+ indirect_ptr &= GENMASK_ULL(51, 16);
+
+ /* Find the address of the actual entry */
+ index = id % (SZ_64K / esz);
+ indirect_ptr += index * esz;
+ gfn = indirect_ptr >> PAGE_SHIFT;
+
+ if (eaddr)
+ *eaddr = indirect_ptr;
+
+out:
+ idx = srcu_read_lock(&its->dev->kvm->srcu);
+ ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+ srcu_read_unlock(&its->dev->kvm->srcu, idx);
+ return ret;
+}
+
+static int vgic_its_alloc_collection(struct vgic_its *its,
+ struct its_collection **colp,
+ u32 coll_id)
+{
+ struct its_collection *collection;
+
+ if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
+ return E_ITS_MAPC_COLLECTION_OOR;
+
+ collection = kzalloc(sizeof(*collection), GFP_KERNEL);
+ if (!collection)
+ return -ENOMEM;
+
+ collection->collection_id = coll_id;
+ collection->target_addr = COLLECTION_NOT_MAPPED;
+
+ list_add_tail(&collection->coll_list, &its->collection_list);
+ *colp = collection;
+
+ return 0;
+}
+
+static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
+{
+ struct its_collection *collection;
+ struct its_device *device;
+ struct its_ite *ite;
+
+ /*
+ * Clearing the mapping for that collection ID removes the
+ * entry from the list. If there wasn't any before, we can
+ * go home early.
+ */
+ collection = find_collection(its, coll_id);
+ if (!collection)
+ return;
+
+ for_each_lpi_its(device, ite, its)
+ if (ite->collection &&
+ ite->collection->collection_id == coll_id)
+ ite->collection = NULL;
+
+ list_del(&collection->coll_list);
+ kfree(collection);
+}
+
+/* Must be called with its_lock mutex held */
+static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
+ struct its_collection *collection,
+ u32 event_id)
+{
+ struct its_ite *ite;
+
+ ite = kzalloc(sizeof(*ite), GFP_KERNEL);
+ if (!ite)
+ return ERR_PTR(-ENOMEM);
+
+ ite->event_id = event_id;
+ ite->collection = collection;
+
+ list_add_tail(&ite->ite_list, &device->itt_head);
+ return ite;
+}
+
+/*
+ * The MAPTI and MAPI commands map LPIs to ITTEs.
+ * Must be called with its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct its_ite *ite;
+ struct kvm_vcpu *vcpu = NULL;
+ struct its_device *device;
+ struct its_collection *collection, *new_coll = NULL;
+ struct vgic_irq *irq;
+ int lpi_nr;
+
+ device = find_its_device(its, device_id);
+ if (!device)
+ return E_ITS_MAPTI_UNMAPPED_DEVICE;
+
+ if (event_id >= BIT_ULL(device->num_eventid_bits))
+ return E_ITS_MAPTI_ID_OOR;
+
+ if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
+ lpi_nr = its_cmd_get_physical_id(its_cmd);
+ else
+ lpi_nr = event_id;
+ if (lpi_nr < GIC_LPI_OFFSET ||
+ lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
+ return E_ITS_MAPTI_PHYSICALID_OOR;
+
+ /* If there is an existing mapping, behavior is UNPREDICTABLE. */
+ if (find_ite(its, device_id, event_id))
+ return 0;
+
+ collection = find_collection(its, coll_id);
+ if (!collection) {
+ int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+ if (ret)
+ return ret;
+ new_coll = collection;
+ }
+
+ ite = vgic_its_alloc_ite(device, collection, event_id);
+ if (IS_ERR(ite)) {
+ if (new_coll)
+ vgic_its_free_collection(its, coll_id);
+ return PTR_ERR(ite);
+ }
+
+ if (its_is_collection_mapped(collection))
+ vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+ irq = vgic_add_lpi(kvm, lpi_nr, vcpu);
+ if (IS_ERR(irq)) {
+ if (new_coll)
+ vgic_its_free_collection(its, coll_id);
+ its_free_ite(kvm, ite);
+ return PTR_ERR(irq);
+ }
+ ite->irq = irq;
+
+ return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void vgic_its_free_device(struct kvm *kvm, struct its_device *device)
+{
+ struct its_ite *ite, *temp;
+
+ /*
+ * The spec says that unmapping a device with still valid
+ * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
+ * since we cannot leave the memory unreferenced.
+ */
+ list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
+ its_free_ite(kvm, ite);
+
+ vgic_its_invalidate_cache(kvm);
+
+ list_del(&device->dev_list);
+ kfree(device);
+}
+
+/* its lock must be held */
+static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
+{
+ struct its_device *cur, *temp;
+
+ list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
+ vgic_its_free_device(kvm, cur);
+}
+
+/* its lock must be held */
+static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
+{
+ struct its_collection *cur, *temp;
+
+ list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
+ vgic_its_free_collection(its, cur->collection_id);
+}
+
+/* Must be called with its_lock mutex held */
+static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
+ u32 device_id, gpa_t itt_addr,
+ u8 num_eventid_bits)
+{
+ struct its_device *device;
+
+ device = kzalloc(sizeof(*device), GFP_KERNEL);
+ if (!device)
+ return ERR_PTR(-ENOMEM);
+
+ device->device_id = device_id;
+ device->itt_addr = itt_addr;
+ device->num_eventid_bits = num_eventid_bits;
+ INIT_LIST_HEAD(&device->itt_head);
+
+ list_add_tail(&device->dev_list, &its->device_list);
+ return device;
+}
+
+/*
+ * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ bool valid = its_cmd_get_validbit(its_cmd);
+ u8 num_eventid_bits = its_cmd_get_size(its_cmd);
+ gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd);
+ struct its_device *device;
+
+ if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL))
+ return E_ITS_MAPD_DEVICE_OOR;
+
+ if (valid && num_eventid_bits > VITS_TYPER_IDBITS)
+ return E_ITS_MAPD_ITTSIZE_OOR;
+
+ device = find_its_device(its, device_id);
+
+ /*
+ * The spec says that calling MAPD on an already mapped device
+ * invalidates all cached data for this device. We implement this
+ * by removing the mapping and re-establishing it.
+ */
+ if (device)
+ vgic_its_free_device(kvm, device);
+
+ /*
+ * The spec does not say whether unmapping a not-mapped device
+ * is an error, so we are done in any case.
+ */
+ if (!valid)
+ return 0;
+
+ device = vgic_its_alloc_device(its, device_id, itt_addr,
+ num_eventid_bits);
+
+ return PTR_ERR_OR_ZERO(device);
+}
+
+/*
+ * The MAPC command maps collection IDs to redistributors.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u16 coll_id;
+ u32 target_addr;
+ struct its_collection *collection;
+ bool valid;
+
+ valid = its_cmd_get_validbit(its_cmd);
+ coll_id = its_cmd_get_collection(its_cmd);
+ target_addr = its_cmd_get_target_addr(its_cmd);
+
+ if (target_addr >= atomic_read(&kvm->online_vcpus))
+ return E_ITS_MAPC_PROCNUM_OOR;
+
+ if (!valid) {
+ vgic_its_free_collection(its, coll_id);
+ vgic_its_invalidate_cache(kvm);
+ } else {
+ collection = find_collection(its, coll_id);
+
+ if (!collection) {
+ int ret;
+
+ ret = vgic_its_alloc_collection(its, &collection,
+ coll_id);
+ if (ret)
+ return ret;
+ collection->target_addr = target_addr;
+ } else {
+ collection->target_addr = target_addr;
+ update_affinity_collection(kvm, its, collection);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * The CLEAR command removes the pending state for a particular LPI.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_ite *ite;
+
+ ite = find_ite(its, device_id, event_id);
+ if (!ite)
+ return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
+
+ ite->irq->pending_latch = false;
+
+ if (ite->irq->hw)
+ return irq_set_irqchip_state(ite->irq->host_irq,
+ IRQCHIP_STATE_PENDING, false);
+
+ return 0;
+}
+
+/*
+ * The INV command syncs the configuration bits from the memory table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_ite *ite;
+
+ ite = find_ite(its, device_id, event_id);
+ if (!ite)
+ return E_ITS_INV_UNMAPPED_INTERRUPT;
+
+ return update_lpi_config(kvm, ite->irq, NULL, true);
+}
+
+/*
+ * The INVALL command requests flushing of all IRQ data in this collection.
+ * Find the VCPU mapped to that collection, then iterate over the VM's list
+ * of mapped LPIs and update the configuration for each IRQ which targets
+ * the specified vcpu. The configuration will be read from the in-memory
+ * configuration table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct its_collection *collection;
+ struct kvm_vcpu *vcpu;
+ struct vgic_irq *irq;
+ u32 *intids;
+ int irq_count, i;
+
+ collection = find_collection(its, coll_id);
+ if (!its_is_collection_mapped(collection))
+ return E_ITS_INVALL_UNMAPPED_COLLECTION;
+
+ vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+ irq_count = vgic_copy_lpi_list(kvm, vcpu, &intids);
+ if (irq_count < 0)
+ return irq_count;
+
+ for (i = 0; i < irq_count; i++) {
+ irq = vgic_get_irq(kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
+ update_lpi_config(kvm, irq, vcpu, false);
+ vgic_put_irq(kvm, irq);
+ }
+
+ kfree(intids);
+
+ if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
+ its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);
+
+ return 0;
+}
+
+/*
+ * The MOVALL command moves the pending state of all IRQs targeting one
+ * redistributor to another. We don't hold the pending state in the VCPUs,
+ * but in the IRQs instead, so there is really not much to do for us here.
+ * However the spec says that no IRQ must target the old redistributor
+ * afterwards, so we make sure that no LPI is using the associated target_vcpu.
+ * This command affects all LPIs in the system that target that redistributor.
+ */
+static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 target1_addr = its_cmd_get_target_addr(its_cmd);
+ u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+ struct kvm_vcpu *vcpu1, *vcpu2;
+ struct vgic_irq *irq;
+ u32 *intids;
+ int irq_count, i;
+
+ if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
+ target2_addr >= atomic_read(&kvm->online_vcpus))
+ return E_ITS_MOVALL_PROCNUM_OOR;
+
+ if (target1_addr == target2_addr)
+ return 0;
+
+ vcpu1 = kvm_get_vcpu(kvm, target1_addr);
+ vcpu2 = kvm_get_vcpu(kvm, target2_addr);
+
+ irq_count = vgic_copy_lpi_list(kvm, vcpu1, &intids);
+ if (irq_count < 0)
+ return irq_count;
+
+ for (i = 0; i < irq_count; i++) {
+ irq = vgic_get_irq(kvm, NULL, intids[i]);
+
+ update_affinity(irq, vcpu2);
+
+ vgic_put_irq(kvm, irq);
+ }
+
+ vgic_its_invalidate_cache(kvm);
+
+ kfree(intids);
+ return 0;
+}
+
+/*
+ * The INT command injects the LPI associated with that DevID/EvID pair.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 msi_data = its_cmd_get_id(its_cmd);
+ u64 msi_devid = its_cmd_get_deviceid(its_cmd);
+
+ return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
+}
+
+/*
+ * This function is called with the its_cmd lock held, but the ITS data
+ * structure lock dropped.
+ */
+static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ int ret = -ENODEV;
+
+ mutex_lock(&its->its_lock);
+ switch (its_cmd_get_command(its_cmd)) {
+ case GITS_CMD_MAPD:
+ ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPC:
+ ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPI:
+ ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPTI:
+ ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MOVI:
+ ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_DISCARD:
+ ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_CLEAR:
+ ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MOVALL:
+ ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INT:
+ ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INV:
+ ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INVALL:
+ ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_SYNC:
+ /* we ignore this command: we are in sync all of the time */
+ ret = 0;
+ break;
+ }
+ mutex_unlock(&its->its_lock);
+
+ return ret;
+}
+
+static u64 vgic_sanitise_its_baser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
+ GITS_BASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
+ GITS_BASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
+ GITS_BASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ /* We support only one (ITS) page size: 64K */
+ reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
+
+ return reg;
+}
+
+static u64 vgic_sanitise_its_cbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
+ GITS_CBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
+ GITS_CBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
+ GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ /* Sanitise the physical address to be 64k aligned. */
+ reg &= ~GENMASK_ULL(15, 12);
+
+ return reg;
+}
+
+static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->cbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ /* When GITS_CTLR.Enable is 1, this register is RO. */
+ if (its->enabled)
+ return;
+
+ mutex_lock(&its->cmd_lock);
+ its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
+ its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
+ its->creadr = 0;
+ /*
+ * CWRITER is architecturally UNKNOWN on reset, but we need to reset
+ * it to CREADR to make sure we start with an empty command buffer.
+ */
+ its->cwriter = its->creadr;
+ mutex_unlock(&its->cmd_lock);
+}
+
+#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12)
+#define ITS_CMD_SIZE 32
+#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5))
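+
+/*
+ * GITS_CBASER.Size encodes the number of 4K pages minus one: a Size field
+ * of 0 yields a 4KiB ring holding 128 32-byte commands. CREADR/CWRITER
+ * offsets are kept 32-byte aligned by ITS_CMD_OFFSET().
+ */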
+
+/* Must be called with the cmd_lock held. */
+static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
+{
+ gpa_t cbaser;
+ u64 cmd_buf[4];
+
+ /* Commands are only processed when the ITS is enabled. */
+ if (!its->enabled)
+ return;
+
+ cbaser = GITS_CBASER_ADDRESS(its->cbaser);
+
+ while (its->cwriter != its->creadr) {
+ int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr,
+ cmd_buf, ITS_CMD_SIZE);
+ /*
+ * If kvm_read_guest_lock() fails, this could be due to the guest
+ * programming a bogus value in CBASER or something else going
+ * wrong from which we cannot easily recover.
+ * According to section 6.3.2 in the GICv3 spec we can simply
+ * ignore that command.
+ */
+ if (!ret)
+ vgic_its_handle_command(kvm, its, cmd_buf);
+
+ its->creadr += ITS_CMD_SIZE;
+ if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
+ its->creadr = 0;
+ }
+}
+
+/*
+ * By writing to CWRITER the guest announces new commands to be processed.
+ * To avoid any races in the first place, we take the its_cmd lock, which
+ * protects our ring buffer variables, so that there is only one user
+ * per ITS handling commands at a given time.
+ */
+static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u64 reg;
+
+ if (!its)
+ return;
+
+ mutex_lock(&its->cmd_lock);
+
+ reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
+ reg = ITS_CMD_OFFSET(reg);
+ if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+ mutex_unlock(&its->cmd_lock);
+ return;
+ }
+ its->cwriter = reg;
+
+ vgic_its_process_commands(kvm, its);
+
+ mutex_unlock(&its->cmd_lock);
+}
+
+static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->cwriter, addr & 0x7, len);
+}
+
+static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->creadr, addr & 0x7, len);
+}
+
+static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 cmd_offset;
+ int ret = 0;
+
+ mutex_lock(&its->cmd_lock);
+
+ if (its->enabled) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ cmd_offset = ITS_CMD_OFFSET(val);
+ if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ its->creadr = cmd_offset;
+out:
+ mutex_unlock(&its->cmd_lock);
+ return ret;
+}
+
+#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
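+/* GITS_BASER0 is the device table, GITS_BASER1 the collection table. */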
+static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u64 reg;
+
+ switch (BASER_INDEX(addr)) {
+ case 0:
+ reg = its->baser_device_table;
+ break;
+ case 1:
+ reg = its->baser_coll_table;
+ break;
+ default:
+ reg = 0;
+ break;
+ }
+
+ return extract_bytes(reg, addr & 7, len);
+}
+
+#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
+static void vgic_mmio_write_its_baser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 entry_size, table_type;
+ u64 reg, *regptr, clearbits = 0;
+
+ /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
+ if (its->enabled)
+ return;
+
+ switch (BASER_INDEX(addr)) {
+ case 0:
+ regptr = &its->baser_device_table;
+ entry_size = abi->dte_esz;
+ table_type = GITS_BASER_TYPE_DEVICE;
+ break;
+ case 1:
+ regptr = &its->baser_coll_table;
+ entry_size = abi->cte_esz;
+ table_type = GITS_BASER_TYPE_COLLECTION;
+ clearbits = GITS_BASER_INDIRECT;
+ break;
+ default:
+ return;
+ }
+
+ reg = update_64bit_reg(*regptr, addr & 7, len, val);
+ reg &= ~GITS_BASER_RO_MASK;
+ reg &= ~clearbits;
+
+ reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
+ reg |= table_type << GITS_BASER_TYPE_SHIFT;
+ reg = vgic_sanitise_its_baser(reg);
+
+ *regptr = reg;
+
+ if (!(reg & GITS_BASER_VALID)) {
+ /* Take the its_lock to prevent a race with a save/restore */
+ mutex_lock(&its->its_lock);
+ switch (table_type) {
+ case GITS_BASER_TYPE_DEVICE:
+ vgic_its_free_device_list(kvm, its);
+ break;
+ case GITS_BASER_TYPE_COLLECTION:
+ vgic_its_free_collection_list(kvm, its);
+ break;
+ }
+ mutex_unlock(&its->its_lock);
+ }
+}
+
+static unsigned long vgic_mmio_read_its_ctlr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u32 reg = 0;
+
+ mutex_lock(&its->cmd_lock);
+ if (its->creadr == its->cwriter)
+ reg |= GITS_CTLR_QUIESCENT;
+ if (its->enabled)
+ reg |= GITS_CTLR_ENABLE;
+ mutex_unlock(&its->cmd_lock);
+
+ return reg;
+}
+
+static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ mutex_lock(&its->cmd_lock);
+
+ /*
+ * It is UNPREDICTABLE to enable the ITS if any of the CBASER or
+ * device/collection BASER are invalid
+ */
+ if (!its->enabled && (val & GITS_CTLR_ENABLE) &&
+ (!(its->baser_device_table & GITS_BASER_VALID) ||
+ !(its->baser_coll_table & GITS_BASER_VALID) ||
+ !(its->cbaser & GITS_CBASER_VALID)))
+ goto out;
+
+ its->enabled = !!(val & GITS_CTLR_ENABLE);
+ if (!its->enabled)
+ vgic_its_invalidate_cache(kvm);
+
+ /*
+ * Try to process any pending commands. This function bails out early
+ * if the ITS is disabled or no commands have been queued.
+ */
+ vgic_its_process_commands(kvm, its);
+
+out:
+ mutex_unlock(&its->cmd_lock);
+}
+
+#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \
+{ \
+ .reg_offset = off, \
+ .len = length, \
+ .access_flags = acc, \
+ .its_read = rd, \
+ .its_write = wr, \
+}
+
+#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\
+{ \
+ .reg_offset = off, \
+ .len = length, \
+ .access_flags = acc, \
+ .its_read = rd, \
+ .its_write = wr, \
+ .uaccess_its_write = uwr, \
+}
+
+static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len, unsigned long val)
+{
+ /* Ignore */
+}
+
+static struct vgic_register_region its_registers[] = {
+ REGISTER_ITS_DESC(GITS_CTLR,
+ vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC_UACCESS(GITS_IIDR,
+ vgic_mmio_read_its_iidr, its_mmio_write_wi,
+ vgic_mmio_uaccess_write_its_iidr, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_TYPER,
+ vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_CBASER,
+ vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_CWRITER,
+ vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC_UACCESS(GITS_CREADR,
+ vgic_mmio_read_its_creadr, its_mmio_write_wi,
+ vgic_mmio_uaccess_write_its_creadr, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_BASER,
+ vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_IDREGS_BASE,
+ vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
+ VGIC_ACCESS_32bit),
+};
+
+/* This is called on setting the LPI enable bit in the redistributor. */
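+/* GICR_PENDBASER.PTZ set means the pending table is known to be all zeroes. */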
+void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
+ its_sync_lpi_pending_table(vcpu);
+}
+
+static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its,
+ u64 addr)
+{
+ struct vgic_io_device *iodev = &its->iodev;
+ int ret;
+
+ mutex_lock(&kvm->slots_lock);
+ if (!IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ its->vgic_its_base = addr;
+ iodev->regions = its_registers;
+ iodev->nr_regions = ARRAY_SIZE(its_registers);
+ kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
+
+ iodev->base_addr = its->vgic_its_base;
+ iodev->iodev_type = IODEV_ITS;
+ iodev->its = its;
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
+ KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
+out:
+ mutex_unlock(&kvm->slots_lock);
+
+ return ret;
+}
+
+/* Default is 16 cached LPIs per vcpu */
+#define LPI_DEFAULT_PCPU_CACHE_SIZE 16
+
+void vgic_lpi_translation_cache_init(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned int sz;
+ int i;
+
+ if (!list_empty(&dist->lpi_translation_cache))
+ return;
+
+ sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
+
+ for (i = 0; i < sz; i++) {
+ struct vgic_translation_cache_entry *cte;
+
+ /* An allocation failure is not fatal */
+ cte = kzalloc(sizeof(*cte), GFP_KERNEL);
+ if (WARN_ON(!cte))
+ break;
+
+ INIT_LIST_HEAD(&cte->entry);
+ list_add(&cte->entry, &dist->lpi_translation_cache);
+ }
+}
+
+void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_translation_cache_entry *cte, *tmp;
+
+ vgic_its_invalidate_cache(kvm);
+
+ list_for_each_entry_safe(cte, tmp,
+ &dist->lpi_translation_cache, entry) {
+ list_del(&cte->entry);
+ kfree(cte);
+ }
+}
+
+#define INITIAL_BASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \
+ GITS_BASER_PAGE_SIZE_64K)
+
+#define INITIAL_PROPBASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
+
+static int vgic_its_create(struct kvm_device *dev, u32 type)
+{
+ struct vgic_its *its;
+
+ if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
+ return -ENODEV;
+
+ its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
+ if (!its)
+ return -ENOMEM;
+
+ if (vgic_initialized(dev->kvm)) {
+ int ret = vgic_v4_init(dev->kvm);
+ if (ret < 0) {
+ kfree(its);
+ return ret;
+ }
+
+ vgic_lpi_translation_cache_init(dev->kvm);
+ }
+
+ mutex_init(&its->its_lock);
+ mutex_init(&its->cmd_lock);
+
+ its->vgic_its_base = VGIC_ADDR_UNDEF;
+
+ INIT_LIST_HEAD(&its->device_list);
+ INIT_LIST_HEAD(&its->collection_list);
+
+ dev->kvm->arch.vgic.msis_require_devid = true;
+ dev->kvm->arch.vgic.has_its = true;
+ its->enabled = false;
+ its->dev = dev;
+
+ its->baser_device_table = INITIAL_BASER_VALUE |
+ ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
+ its->baser_coll_table = INITIAL_BASER_VALUE |
+ ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
+ dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
+
+ dev->private = its;
+
+ return vgic_its_set_abi(its, NR_ITS_ABIS - 1);
+}
+
+static void vgic_its_destroy(struct kvm_device *kvm_dev)
+{
+ struct kvm *kvm = kvm_dev->kvm;
+ struct vgic_its *its = kvm_dev->private;
+
+ mutex_lock(&its->its_lock);
+
+ vgic_its_free_device_list(kvm, its);
+ vgic_its_free_collection_list(kvm, its);
+
+ mutex_unlock(&its->its_lock);
+ kfree(its);
+ /* Allocated by kvm_ioctl_create_device(), freed by .destroy. */
+ kfree(kvm_dev);
+}
+
+static int vgic_its_has_attr_regs(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ const struct vgic_register_region *region;
+ gpa_t offset = attr->attr;
+ int align;
+
+ align = (offset < GITS_TYPER) || (offset >= GITS_PIDR4) ? 0x3 : 0x7;
+
+ if (offset & align)
+ return -EINVAL;
+
+ region = vgic_find_mmio_region(its_registers,
+ ARRAY_SIZE(its_registers),
+ offset);
+ if (!region)
+ return -ENXIO;
+
+ return 0;
+}
+
+static int vgic_its_attr_regs_access(struct kvm_device *dev,
+ struct kvm_device_attr *attr,
+ u64 *reg, bool is_write)
+{
+ const struct vgic_register_region *region;
+ struct vgic_its *its;
+ gpa_t addr, offset;
+ unsigned int len;
+ int align, ret = 0;
+
+ its = dev->private;
+ offset = attr->attr;
+
+ /*
+ * Although the spec supports upper/lower 32-bit accesses to
+ * 64-bit ITS registers, the userspace ABI requires 64-bit
+ * accesses to all 64-bit wide registers. We therefore only
+ * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID
+ * registers
+ */
+ if ((offset < GITS_TYPER) || (offset >= GITS_PIDR4))
+ align = 0x3;
+ else
+ align = 0x7;
+
+ if (offset & align)
+ return -EINVAL;
+
+ mutex_lock(&dev->kvm->lock);
+
+ if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ region = vgic_find_mmio_region(its_registers,
+ ARRAY_SIZE(its_registers),
+ offset);
+ if (!region) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ if (!lock_all_vcpus(dev->kvm)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ addr = its->vgic_its_base + offset;
+
+ len = region->access_flags & VGIC_ACCESS_64bit ? 8 : 4;
+
+ if (is_write) {
+ if (region->uaccess_its_write)
+ ret = region->uaccess_its_write(dev->kvm, its, addr,
+ len, *reg);
+ else
+ region->its_write(dev->kvm, its, addr, len, *reg);
+ } else {
+ *reg = region->its_read(dev->kvm, its, addr, len);
+ }
+ unlock_all_vcpus(dev->kvm);
+out:
+ mutex_unlock(&dev->kvm->lock);
+ return ret;
+}
+
+static u32 compute_next_devid_offset(struct list_head *h,
+ struct its_device *dev)
+{
+ struct its_device *next;
+ u32 next_offset;
+
+ if (list_is_last(&dev->dev_list, h))
+ return 0;
+ next = list_next_entry(dev, dev_list);
+ next_offset = next->device_id - dev->device_id;
+
+ return min_t(u32, next_offset, VITS_DTE_MAX_DEVID_OFFSET);
+}
+
+static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite)
+{
+ struct its_ite *next;
+ u32 next_offset;
+
+ if (list_is_last(&ite->ite_list, h))
+ return 0;
+ next = list_next_entry(ite, ite_list);
+ next_offset = next->event_id - ite->event_id;
+
+ return min_t(u32, next_offset, VITS_ITE_MAX_EVENTID_OFFSET);
+}
+
+/**
+ * entry_fn_t - Callback called on a table entry restore path
+ * @its: its handle
+ * @id: id of the entry
+ * @entry: pointer to the entry
+ * @opaque: pointer to opaque data
+ *
+ * Return: < 0 on error, 0 if last element was identified, id offset to next
+ * element otherwise
+ */
+typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
+ void *opaque);
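+/*
+ * handle_l1_dte(), vgic_its_restore_dte() and vgic_its_restore_ite() are
+ * the entry_fn_t callbacks used with scan_its_table() in this file.
+ */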
+
+/**
+ * scan_its_table - Scan a contiguous table in guest RAM and apply a
+ * function to each entry
+ *
+ * @its: its handle
+ * @base: base gpa of the table
+ * @size: size of the table in bytes
+ * @esz: entry size in bytes
+ * @start_id: the ID of the first entry in the table
+ * (non-zero for 2nd level tables)
+ * @fn: function to apply on each entry
+ *
+ * Return: < 0 on error, 0 if last element was identified, 1 otherwise
+ * (the last element may not be found on second level tables)
+ */
+static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
+ int start_id, entry_fn_t fn, void *opaque)
+{
+ struct kvm *kvm = its->dev->kvm;
+ unsigned long len = size;
+ int id = start_id;
+ gpa_t gpa = base;
+ char entry[ESZ_MAX];
+ int ret;
+
+ memset(entry, 0, esz);
+
+ while (len > 0) {
+ int next_offset;
+ size_t byte_offset;
+
+ ret = kvm_read_guest_lock(kvm, gpa, entry, esz);
+ if (ret)
+ return ret;
+
+ next_offset = fn(its, id, entry, opaque);
+ if (next_offset <= 0)
+ return next_offset;
+
+ byte_offset = next_offset * esz;
+ id += next_offset;
+ gpa += byte_offset;
+ len -= byte_offset;
+ }
+ return 1;
+}
+
+/**
+ * vgic_its_save_ite - Save an interrupt translation entry at @gpa
+ */
+static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
+ struct its_ite *ite, gpa_t gpa, int ite_esz)
+{
+ struct kvm *kvm = its->dev->kvm;
+ u32 next_offset;
+ u64 val;
+
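+ /*
+ * Pack the entry as <next event ID offset | LPI INTID | collection ID>
+ * using the KVM_ITS_ITE_* shifts and store it little-endian, which is
+ * what vgic_its_restore_ite() expects to unpack.
+ */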
+ next_offset = compute_next_eventid_offset(&dev->itt_head, ite);
+ val = ((u64)next_offset << KVM_ITS_ITE_NEXT_SHIFT) |
+ ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
+ ite->collection->collection_id;
+ val = cpu_to_le64(val);
+ return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
+}
+
+/**
+ * vgic_its_restore_ite - restore an interrupt translation entry
+ * @event_id: id used for indexing
+ * @ptr: pointer to the ITE entry
+ * @opaque: pointer to the its_device
+ */
+static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
+ void *ptr, void *opaque)
+{
+ struct its_device *dev = (struct its_device *)opaque;
+ struct its_collection *collection;
+ struct kvm *kvm = its->dev->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ u64 val;
+ u64 *p = (u64 *)ptr;
+ struct vgic_irq *irq;
+ u32 coll_id, lpi_id;
+ struct its_ite *ite;
+ u32 offset;
+
+ val = *p;
+
+ val = le64_to_cpu(val);
+
+ coll_id = val & KVM_ITS_ITE_ICID_MASK;
+ lpi_id = (val & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT;
+
+ if (!lpi_id)
+ return 1; /* invalid entry, no choice but to scan next entry */
+
+ if (lpi_id < VGIC_MIN_LPI)
+ return -EINVAL;
+
+ offset = val >> KVM_ITS_ITE_NEXT_SHIFT;
+ if (event_id + offset >= BIT_ULL(dev->num_eventid_bits))
+ return -EINVAL;
+
+ collection = find_collection(its, coll_id);
+ if (!collection)
+ return -EINVAL;
+
+ ite = vgic_its_alloc_ite(dev, collection, event_id);
+ if (IS_ERR(ite))
+ return PTR_ERR(ite);
+
+ if (its_is_collection_mapped(collection))
+ vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+ irq = vgic_add_lpi(kvm, lpi_id, vcpu);
+ if (IS_ERR(irq))
+ return PTR_ERR(irq);
+ ite->irq = irq;
+
+ return offset;
+}
+
+static int vgic_its_ite_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct its_ite *itea = container_of(a, struct its_ite, ite_list);
+ struct its_ite *iteb = container_of(b, struct its_ite, ite_list);
+
+ if (itea->event_id < iteb->event_id)
+ return -1;
+ else
+ return 1;
+}
+
+static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ gpa_t base = device->itt_addr;
+ struct its_ite *ite;
+ int ret;
+ int ite_esz = abi->ite_esz;
+
+ list_sort(NULL, &device->itt_head, vgic_its_ite_cmp);
+
+ list_for_each_entry(ite, &device->itt_head, ite_list) {
+ gpa_t gpa = base + ite->event_id * ite_esz;
+
+ /*
+ * If an LPI carries the HW bit, this means that this
+ * interrupt is controlled by GICv4, and we do not
+ * have direct access to that state. Let's simply fail
+ * the save operation...
+ */
+ if (ite->irq->hw)
+ return -EACCES;
+
+ ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * vgic_its_restore_itt - restore the ITT of a device
+ *
+ * @its: its handle
+ * @dev: device handle
+ *
+ * Return 0 on success, < 0 on error
+ */
+static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ gpa_t base = dev->itt_addr;
+ int ret;
+ int ite_esz = abi->ite_esz;
+ size_t max_size = BIT_ULL(dev->num_eventid_bits) * ite_esz;
+
+ ret = scan_its_table(its, base, max_size, ite_esz, 0,
+ vgic_its_restore_ite, dev);
+
+ /* scan_its_table returns +1 if all ITEs are invalid */
+ if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
+/**
+ * vgic_its_save_dte - Save a device table entry at a given GPA
+ *
+ * @its: ITS handle
+ * @dev: ITS device
+ * @ptr: GPA
+ */
+static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
+ gpa_t ptr, int dte_esz)
+{
+ struct kvm *kvm = its->dev->kvm;
+ u64 val, itt_addr_field;
+ u32 next_offset;
+
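+ /*
+ * Pack the entry as <valid | next device ID offset | ITT address >> 8 |
+ * num_eventid_bits - 1> using the KVM_ITS_DTE_* shifts and store it
+ * little-endian, mirroring what vgic_its_restore_dte() unpacks.
+ */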
+ itt_addr_field = dev->itt_addr >> 8;
+ next_offset = compute_next_devid_offset(&its->device_list, dev);
+ val = (1ULL << KVM_ITS_DTE_VALID_SHIFT |
+ ((u64)next_offset << KVM_ITS_DTE_NEXT_SHIFT) |
+ (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
+ (dev->num_eventid_bits - 1));
+ val = cpu_to_le64(val);
+ return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
+}
+
+/**
+ * vgic_its_restore_dte - restore a device table entry
+ *
+ * @its: its handle
+ * @id: device id the DTE corresponds to
+ * @ptr: kernel VA where the 8 byte DTE is located
+ * @opaque: unused
+ *
+ * Return: < 0 on error, 0 if the dte is the last one, id offset to the
+ * next dte otherwise
+ */
+static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
+ void *ptr, void *opaque)
+{
+ struct its_device *dev;
+ gpa_t itt_addr;
+ u8 num_eventid_bits;
+ u64 entry = *(u64 *)ptr;
+ bool valid;
+ u32 offset;
+ int ret;
+
+ entry = le64_to_cpu(entry);
+
+ valid = entry >> KVM_ITS_DTE_VALID_SHIFT;
+ num_eventid_bits = (entry & KVM_ITS_DTE_SIZE_MASK) + 1;
+ itt_addr = ((entry & KVM_ITS_DTE_ITTADDR_MASK)
+ >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8;
+
+ if (!valid)
+ return 1;
+
+ /* dte entry is valid */
+ offset = (entry & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;
+
+ dev = vgic_its_alloc_device(its, id, itt_addr, num_eventid_bits);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
+ ret = vgic_its_restore_itt(its, dev);
+ if (ret) {
+ vgic_its_free_device(its->dev->kvm, dev);
+ return ret;
+ }
+
+ return offset;
+}
+
+static int vgic_its_device_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
+{
+ struct its_device *deva = container_of(a, struct its_device, dev_list);
+ struct its_device *devb = container_of(b, struct its_device, dev_list);
+
+ if (deva->device_id < devb->device_id)
+ return -1;
+ else
+ return 1;
+}
+
+/**
+ * vgic_its_save_device_tables - Save the device table and all ITTs
+ * into guest RAM
+ *
+ * L1/L2 handling is hidden by the vgic_its_check_id() helper, which
+ * directly returns the GPA of the device entry.
+ */
+static int vgic_its_save_device_tables(struct vgic_its *its)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 baser = its->baser_device_table;
+ struct its_device *dev;
+ int dte_esz = abi->dte_esz;
+
+ if (!(baser & GITS_BASER_VALID))
+ return 0;
+
+ list_sort(NULL, &its->device_list, vgic_its_device_cmp);
+
+ list_for_each_entry(dev, &its->device_list, dev_list) {
+ int ret;
+ gpa_t eaddr;
+
+ if (!vgic_its_check_id(its, baser,
+ dev->device_id, &eaddr))
+ return -EINVAL;
+
+ ret = vgic_its_save_itt(its, dev);
+ if (ret)
+ return ret;
+
+ ret = vgic_its_save_dte(its, dev, eaddr, dte_esz);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * handle_l1_dte - callback used for L1 device table entries (2 stage case)
+ *
+ * @its: its handle
+ * @id: index of the entry in the L1 table
+ * @addr: kernel VA
+ * @opaque: unused
+ *
+ * L1 table entries are scanned one entry at a time.
+ * Return < 0 on error, 0 if the last dte was found while scanning the L2
+ * table, +1 otherwise (meaning the next L1 entry must be scanned)
+ */
+static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
+ void *opaque)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ int l2_start_id = id * (SZ_64K / abi->dte_esz);
+ u64 entry = *(u64 *)addr;
+ int dte_esz = abi->dte_esz;
+ gpa_t gpa;
+ int ret;
+
+ entry = le64_to_cpu(entry);
+
+ if (!(entry & KVM_ITS_L1E_VALID_MASK))
+ return 1;
+
+ gpa = entry & KVM_ITS_L1E_ADDR_MASK;
+
+ ret = scan_its_table(its, gpa, SZ_64K, dte_esz,
+ l2_start_id, vgic_its_restore_dte, NULL);
+
+ return ret;
+}
+
+/**
+ * vgic_its_restore_device_tables - Restore the device table and all ITTs
+ * from guest RAM to internal data structs
+ */
+static int vgic_its_restore_device_tables(struct vgic_its *its)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 baser = its->baser_device_table;
+ int l1_esz, ret;
+ int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+ gpa_t l1_gpa;
+
+ if (!(baser & GITS_BASER_VALID))
+ return 0;
+
+ l1_gpa = GITS_BASER_ADDR_48_to_52(baser);
+
+ if (baser & GITS_BASER_INDIRECT) {
+ l1_esz = GITS_LVL1_ENTRY_SIZE;
+ ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
+ handle_l1_dte, NULL);
+ } else {
+ l1_esz = abi->dte_esz;
+ ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
+ vgic_its_restore_dte, NULL);
+ }
+
+ /* scan_its_table returns +1 if all entries are invalid */
+ if (ret > 0)
+ ret = 0;
+
+ return ret;
+}
+
+static int vgic_its_save_cte(struct vgic_its *its,
+ struct its_collection *collection,
+ gpa_t gpa, int esz)
+{
+ u64 val;
+
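+ /* Pack the valid bit, target redistributor (vcpu index) and collection ID. */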
+ val = (1ULL << KVM_ITS_CTE_VALID_SHIFT |
+ ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
+ collection->collection_id);
+ val = cpu_to_le64(val);
+ return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
+}
+
+static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
+{
+ struct its_collection *collection;
+ struct kvm *kvm = its->dev->kvm;
+ u32 target_addr, coll_id;
+ u64 val;
+ int ret;
+
+ BUG_ON(esz > sizeof(val));
+ ret = kvm_read_guest_lock(kvm, gpa, &val, esz);
+ if (ret)
+ return ret;
+ val = le64_to_cpu(val);
+ if (!(val & KVM_ITS_CTE_VALID_MASK))
+ return 0;
+
+ target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT);
+ coll_id = val & KVM_ITS_CTE_ICID_MASK;
+
+ if (target_addr != COLLECTION_NOT_MAPPED &&
+ target_addr >= atomic_read(&kvm->online_vcpus))
+ return -EINVAL;
+
+ collection = find_collection(its, coll_id);
+ if (collection)
+ return -EEXIST;
+ ret = vgic_its_alloc_collection(its, &collection, coll_id);
+ if (ret)
+ return ret;
+ collection->target_addr = target_addr;
+ return 1;
+}
+
+/**
+ * vgic_its_save_collection_table - Save the collection table into
+ * guest RAM
+ */
+static int vgic_its_save_collection_table(struct vgic_its *its)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 baser = its->baser_coll_table;
+ gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser);
+ struct its_collection *collection;
+ u64 val;
+ size_t max_size, filled = 0;
+ int ret, cte_esz = abi->cte_esz;
+
+ if (!(baser & GITS_BASER_VALID))
+ return 0;
+
+ max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+
+ list_for_each_entry(collection, &its->collection_list, coll_list) {
+ ret = vgic_its_save_cte(its, collection, gpa, cte_esz);
+ if (ret)
+ return ret;
+ gpa += cte_esz;
+ filled += cte_esz;
+ }
+
+ if (filled == max_size)
+ return 0;
+
+ /*
+ * The table is not fully filled; add a final dummy element
+ * with the valid bit unset.
+ */
+ val = 0;
+ BUG_ON(cte_esz > sizeof(val));
+ ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
+ return ret;
+}
+
+/**
+ * vgic_its_restore_collection_table - Read the collection table from
+ * guest memory and restore the ITS internal state. Requires the
+ * BASER registers to be restored beforehand.
+ */
+static int vgic_its_restore_collection_table(struct vgic_its *its)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ u64 baser = its->baser_coll_table;
+ int cte_esz = abi->cte_esz;
+ size_t max_size, read = 0;
+ gpa_t gpa;
+ int ret;
+
+ if (!(baser & GITS_BASER_VALID))
+ return 0;
+
+ gpa = GITS_BASER_ADDR_48_to_52(baser);
+
+ max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+
+ while (read < max_size) {
+ ret = vgic_its_restore_cte(its, gpa, cte_esz);
+ if (ret <= 0)
+ break;
+ gpa += cte_esz;
+ read += cte_esz;
+ }
+
+ if (ret > 0)
+ return 0;
+
+ return ret;
+}
+
+/**
+ * vgic_its_save_tables_v0 - Save the ITS tables into guest RAM
+ * according to v0 ABI
+ */
+static int vgic_its_save_tables_v0(struct vgic_its *its)
+{
+ int ret;
+
+ ret = vgic_its_save_device_tables(its);
+ if (ret)
+ return ret;
+
+ return vgic_its_save_collection_table(its);
+}
+
+/**
+ * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
+ * to internal data structs according to v0 ABI
+ */
+static int vgic_its_restore_tables_v0(struct vgic_its *its)
+{
+ int ret;
+
+ ret = vgic_its_restore_collection_table(its);
+ if (ret)
+ return ret;
+
+ return vgic_its_restore_device_tables(its);
+}
+
+static int vgic_its_commit_v0(struct vgic_its *its)
+{
+ const struct vgic_its_abi *abi;
+
+ abi = vgic_its_get_abi(its);
+ its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
+ its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
+
+ its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5)
+ << GITS_BASER_ENTRY_SIZE_SHIFT);
+
+ its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5)
+ << GITS_BASER_ENTRY_SIZE_SHIFT);
+ return 0;
+}
+
+static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
+{
+ /* We need to keep the ABI specific field values */
+ its->baser_coll_table &= ~GITS_BASER_VALID;
+ its->baser_device_table &= ~GITS_BASER_VALID;
+ its->cbaser = 0;
+ its->creadr = 0;
+ its->cwriter = 0;
+ its->enabled = 0;
+ vgic_its_free_device_list(kvm, its);
+ vgic_its_free_collection_list(kvm, its);
+}
+
+static int vgic_its_has_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ switch (attr->attr) {
+ case KVM_VGIC_ITS_ADDR_TYPE:
+ return 0;
+ }
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return 0;
+ case KVM_DEV_ARM_ITS_CTRL_RESET:
+ return 0;
+ case KVM_DEV_ARM_ITS_SAVE_TABLES:
+ return 0;
+ case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+ return 0;
+ }
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_ITS_REGS:
+ return vgic_its_has_attr_regs(dev, attr);
+ }
+ return -ENXIO;
+}
+
+static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
+{
+ const struct vgic_its_abi *abi = vgic_its_get_abi(its);
+ int ret = 0;
+
+ if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
+ return 0;
+
+ mutex_lock(&kvm->lock);
+ mutex_lock(&its->its_lock);
+
+ if (!lock_all_vcpus(kvm)) {
+ mutex_unlock(&its->its_lock);
+ mutex_unlock(&kvm->lock);
+ return -EBUSY;
+ }
+
+ switch (attr) {
+ case KVM_DEV_ARM_ITS_CTRL_RESET:
+ vgic_its_reset(kvm, its);
+ break;
+ case KVM_DEV_ARM_ITS_SAVE_TABLES:
+ ret = abi->save_tables(its);
+ break;
+ case KVM_DEV_ARM_ITS_RESTORE_TABLES:
+ ret = abi->restore_tables(its);
+ break;
+ }
+
+ unlock_all_vcpus(kvm);
+ mutex_unlock(&its->its_lock);
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static int vgic_its_set_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct vgic_its *its = dev->private;
+ int ret;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ unsigned long type = (unsigned long)attr->attr;
+ u64 addr;
+
+ if (type != KVM_VGIC_ITS_ADDR_TYPE)
+ return -ENODEV;
+
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
+ addr, SZ_64K);
+ if (ret)
+ return ret;
+
+ return vgic_register_its_iodev(dev->kvm, its, addr);
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ return vgic_its_ctrl(dev->kvm, its, attr->attr);
+ case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ u64 reg;
+
+ if (get_user(reg, uaddr))
+ return -EFAULT;
+
+ return vgic_its_attr_regs_access(dev, attr, &reg, true);
+ }
+ }
+ return -ENXIO;
+}
+
+static int vgic_its_get_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ struct vgic_its *its = dev->private;
+ u64 addr = its->vgic_its_base;
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ unsigned long type = (unsigned long)attr->attr;
+
+ if (type != KVM_VGIC_ITS_ADDR_TYPE)
+ return -ENODEV;
+
+ if (copy_to_user(uaddr, &addr, sizeof(addr)))
+ return -EFAULT;
+ break;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ u64 reg;
+ int ret;
+
+ ret = vgic_its_attr_regs_access(dev, attr, &reg, false);
+ if (ret)
+ return ret;
+ return put_user(reg, uaddr);
+ }
+ default:
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static struct kvm_device_ops kvm_arm_vgic_its_ops = {
+ .name = "kvm-arm-vgic-its",
+ .create = vgic_its_create,
+ .destroy = vgic_its_destroy,
+ .set_attr = vgic_its_set_attr,
+ .get_attr = vgic_its_get_attr,
+ .has_attr = vgic_its_has_attr,
+};
+
+int kvm_vgic_register_its_device(void)
+{
+ return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
+ KVM_DEV_TYPE_ARM_VGIC_ITS);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
new file mode 100644
index 000000000000..44419679f91a
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -0,0 +1,741 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGIC: KVM DEVICE API
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <linux/uaccess.h>
+#include <asm/kvm_mmu.h>
+#include <asm/cputype.h>
+#include "vgic.h"
+
+/* common helpers */
+
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+ phys_addr_t addr, phys_addr_t alignment)
+{
+ if (addr & ~kvm_phys_mask(kvm))
+ return -E2BIG;
+
+ if (!IS_ALIGNED(addr, alignment))
+ return -EINVAL;
+
+ if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
+ return -EEXIST;
+
+ return 0;
+}
+
+static int vgic_check_type(struct kvm *kvm, int type_needed)
+{
+ if (kvm->arch.vgic.vgic_model != type_needed)
+ return -ENODEV;
+ else
+ return 0;
+}
+
+/**
+ * kvm_vgic_addr - set or get vgic VM base addresses
+ * @kvm: pointer to the vm struct
+ * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
+ * @addr: pointer to address value
+ * @write: if true set the address in the VM address space, if false read the
+ * address
+ *
+ * Set or get the vgic base addresses for the distributor and the virtual CPU
+ * interface in the VM physical address space. These addresses are properties
+ * of the emulated core/SoC and therefore user space initially knows this
+ * information.
+ * Check them for sanity (alignment, double assignment). We can't check for
+ * overlapping regions in case of a virtual GICv3 here, since we don't know
+ * the number of VCPUs yet, so we defer this check to map_resources().
+ */
+int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
+{
+ int r = 0;
+ struct vgic_dist *vgic = &kvm->arch.vgic;
+ phys_addr_t *addr_ptr, alignment;
+ u64 undef_value = VGIC_ADDR_UNDEF;
+
+ mutex_lock(&kvm->lock);
+ switch (type) {
+ case KVM_VGIC_V2_ADDR_TYPE_DIST:
+ r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+ addr_ptr = &vgic->vgic_dist_base;
+ alignment = SZ_4K;
+ break;
+ case KVM_VGIC_V2_ADDR_TYPE_CPU:
+ r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+ addr_ptr = &vgic->vgic_cpu_base;
+ alignment = SZ_4K;
+ break;
+ case KVM_VGIC_V3_ADDR_TYPE_DIST:
+ r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+ addr_ptr = &vgic->vgic_dist_base;
+ alignment = SZ_64K;
+ break;
+ case KVM_VGIC_V3_ADDR_TYPE_REDIST: {
+ struct vgic_redist_region *rdreg;
+
+ r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (r)
+ break;
+ if (write) {
+ r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
+ goto out;
+ }
+ rdreg = list_first_entry(&vgic->rd_regions,
+ struct vgic_redist_region, list);
+ if (!rdreg)
+ addr_ptr = &undef_value;
+ else
+ addr_ptr = &rdreg->base;
+ break;
+ }
+ case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
+ {
+ struct vgic_redist_region *rdreg;
+ u8 index;
+
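+ /*
+ * The 64-bit attribute value packs the region index, flags, base
+ * address and redistributor count, as defined by the
+ * KVM_VGIC_V3_RDIST_* masks and shifts.
+ */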
+ r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (r)
+ break;
+
+ index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK;
+
+ if (write) {
+ gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK;
+ u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK)
+ >> KVM_VGIC_V3_RDIST_COUNT_SHIFT;
+ u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK)
+ >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT;
+
+ if (!count || flags)
+ r = -EINVAL;
+ else
+ r = vgic_v3_set_redist_base(kvm, index,
+ base, count);
+ goto out;
+ }
+
+ rdreg = vgic_v3_rdist_region_from_index(kvm, index);
+ if (!rdreg) {
+ r = -ENOENT;
+ goto out;
+ }
+
+ *addr = index;
+ *addr |= rdreg->base;
+ *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
+ goto out;
+ }
+ default:
+ r = -ENODEV;
+ }
+
+ if (r)
+ goto out;
+
+ if (write) {
+ r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment);
+ if (!r)
+ *addr_ptr = *addr;
+ } else {
+ *addr = *addr_ptr;
+ }
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int vgic_set_common_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ u64 addr;
+ unsigned long type = (unsigned long)attr->attr;
+
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ r = kvm_vgic_addr(dev->kvm, type, &addr, true);
+ return (r == -ENODEV) ? -ENXIO : r;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u32 val;
+ int ret = 0;
+
+ if (get_user(val, uaddr))
+ return -EFAULT;
+
+ /*
+ * We require:
+ * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
+ * - at most 1024 interrupts
+ * - a multiple of 32 interrupts
+ */
+ if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
+ val > VGIC_MAX_RESERVED ||
+ (val & 31))
+ return -EINVAL;
+
+ mutex_lock(&dev->kvm->lock);
+
+ if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis)
+ ret = -EBUSY;
+ else
+ dev->kvm->arch.vgic.nr_spis =
+ val - VGIC_NR_PRIVATE_IRQS;
+
+ mutex_unlock(&dev->kvm->lock);
+
+ return ret;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ mutex_lock(&dev->kvm->lock);
+ r = vgic_init(dev->kvm);
+ mutex_unlock(&dev->kvm->lock);
+ return r;
+ }
+ break;
+ }
+ }
+
+ return -ENXIO;
+}
+
+static int vgic_get_common_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ int r = -ENXIO;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ u64 addr;
+ unsigned long type = (unsigned long)attr->attr;
+
+ r = kvm_vgic_addr(dev->kvm, type, &addr, false);
+ if (r)
+ return (r == -ENODEV) ? -ENXIO : r;
+
+ if (copy_to_user(uaddr, &addr, sizeof(addr)))
+ return -EFAULT;
+ break;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+
+ r = put_user(dev->kvm->arch.vgic.nr_spis +
+ VGIC_NR_PRIVATE_IRQS, uaddr);
+ break;
+ }
+ }
+
+ return r;
+}
+
+static int vgic_create(struct kvm_device *dev, u32 type)
+{
+ return kvm_vgic_create(dev->kvm, type);
+}
+
+static void vgic_destroy(struct kvm_device *dev)
+{
+ kfree(dev);
+}
+
+int kvm_register_vgic_device(unsigned long type)
+{
+ int ret = -ENODEV;
+
+ switch (type) {
+ case KVM_DEV_TYPE_ARM_VGIC_V2:
+ ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+ KVM_DEV_TYPE_ARM_VGIC_V2);
+ break;
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+ KVM_DEV_TYPE_ARM_VGIC_V3);
+
+ if (ret)
+ break;
+ ret = kvm_vgic_register_its_device();
+ break;
+ }
+
+ return ret;
+}
+
+int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+ struct vgic_reg_attr *reg_attr)
+{
+ int cpuid;
+
+ cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
+ KVM_DEV_ARM_VGIC_CPUID_SHIFT;
+
+ if (cpuid >= atomic_read(&dev->kvm->online_vcpus))
+ return -EINVAL;
+
+ reg_attr->vcpu = kvm_get_vcpu(dev->kvm, cpuid);
+ reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+ return 0;
+}
+
+/* unlocks vcpus from @vcpu_lock_idx and smaller */
+static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
+{
+ struct kvm_vcpu *tmp_vcpu;
+
+ for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
+ tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
+ mutex_unlock(&tmp_vcpu->mutex);
+ }
+}
+
+void unlock_all_vcpus(struct kvm *kvm)
+{
+ unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
+}
+
+/* Returns true if all vcpus were locked, false otherwise */
+bool lock_all_vcpus(struct kvm *kvm)
+{
+ struct kvm_vcpu *tmp_vcpu;
+ int c;
+
+ /*
+ * Any time a vcpu is run, vcpu_load is called which tries to grab the
+ * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
+ * that no other VCPUs are run and fiddle with the vgic state while we
+ * access it.
+ */
+ kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
+ if (!mutex_trylock(&tmp_vcpu->mutex)) {
+ unlock_vcpus(kvm, c - 1);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
+ *
+ * @dev: kvm device handle
+ * @attr: kvm device attribute
+ * @reg: address the value is read from or written to
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_v2_attr_regs_access(struct kvm_device *dev,
+ struct kvm_device_attr *attr,
+ u32 *reg, bool is_write)
+{
+ struct vgic_reg_attr reg_attr;
+ gpa_t addr;
+ struct kvm_vcpu *vcpu;
+ int ret;
+
+ ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
+ if (ret)
+ return ret;
+
+ vcpu = reg_attr.vcpu;
+ addr = reg_attr.addr;
+
+ mutex_lock(&dev->kvm->lock);
+
+ ret = vgic_init(dev->kvm);
+ if (ret)
+ goto out;
+
+ if (!lock_all_vcpus(dev->kvm)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+ ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg);
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ unlock_all_vcpus(dev->kvm);
+out:
+ mutex_unlock(&dev->kvm->lock);
+ return ret;
+}
+
+static int vgic_v2_set_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ int ret;
+
+ ret = vgic_set_common_attr(dev, attr);
+ if (ret != -ENXIO)
+ return ret;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u32 reg;
+
+ if (get_user(reg, uaddr))
+ return -EFAULT;
+
+ return vgic_v2_attr_regs_access(dev, attr, &reg, true);
+ }
+ }
+
+ return -ENXIO;
+}
+
+static int vgic_v2_get_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ int ret;
+
+ ret = vgic_get_common_attr(dev, attr);
+ if (ret != -ENXIO)
+ return ret;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u32 reg = 0;
+
+ ret = vgic_v2_attr_regs_access(dev, attr, &reg, false);
+ if (ret)
+ return ret;
+ return put_user(reg, uaddr);
+ }
+ }
+
+ return -ENXIO;
+}
+
+static int vgic_v2_has_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ switch (attr->attr) {
+ case KVM_VGIC_V2_ADDR_TYPE_DIST:
+ case KVM_VGIC_V2_ADDR_TYPE_CPU:
+ return 0;
+ }
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+ return vgic_v2_has_attr_regs(dev, attr);
+ case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+ return 0;
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return 0;
+ }
+ }
+ return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v2_ops = {
+ .name = "kvm-arm-vgic-v2",
+ .create = vgic_create,
+ .destroy = vgic_destroy,
+ .set_attr = vgic_v2_set_attr,
+ .get_attr = vgic_v2_get_attr,
+ .has_attr = vgic_v2_has_attr,
+};
+
+int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+ struct vgic_reg_attr *reg_attr)
+{
+ unsigned long vgic_mpidr, mpidr_reg;
+
+ /*
+ * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group,
+ * attr might not hold MPIDR. Hence assume vcpu0.
+ */
+ if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
+ vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >>
+ KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT;
+
+ mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr);
+ reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg);
+ } else {
+ reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0);
+ }
+
+ if (!reg_attr->vcpu)
+ return -EINVAL;
+
+ reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
+
+ return 0;
+}
+
+/**
+ * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
+ *
+ * @dev: kvm device handle
+ * @attr: kvm device attribute
+ * @reg: pointer to the value to be read or written
+ * @is_write: true if userspace is writing a register
+ */
+static int vgic_v3_attr_regs_access(struct kvm_device *dev,
+ struct kvm_device_attr *attr,
+ u64 *reg, bool is_write)
+{
+ struct vgic_reg_attr reg_attr;
+ gpa_t addr;
+ struct kvm_vcpu *vcpu;
+ int ret;
+ u32 tmp32;
+
+ ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
+ if (ret)
+ return ret;
+
+ vcpu = reg_attr.vcpu;
+ addr = reg_attr.addr;
+
+ mutex_lock(&dev->kvm->lock);
+
+ if (unlikely(!vgic_initialized(dev->kvm))) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (!lock_all_vcpus(dev->kvm)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ if (is_write)
+ tmp32 = *reg;
+
+ ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32);
+ if (!is_write)
+ *reg = tmp32;
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
+ if (is_write)
+ tmp32 = *reg;
+
+ ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32);
+ if (!is_write)
+ *reg = tmp32;
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+ u64 regid;
+
+ regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
+ ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write,
+ regid, reg);
+ break;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+ unsigned int info, intid;
+
+ info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
+ KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT;
+ if (info == VGIC_LEVEL_INFO_LINE_LEVEL) {
+ intid = attr->attr &
+ KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK;
+ ret = vgic_v3_line_level_info_uaccess(vcpu, is_write,
+ intid, reg);
+ } else {
+ ret = -EINVAL;
+ }
+ break;
+ }
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ unlock_all_vcpus(dev->kvm);
+out:
+ mutex_unlock(&dev->kvm->lock);
+ return ret;
+}
+
+static int vgic_v3_set_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ int ret;
+
+ ret = vgic_set_common_attr(dev, attr);
+ if (ret != -ENXIO)
+ return ret;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u32 tmp32;
+ u64 reg;
+
+ if (get_user(tmp32, uaddr))
+ return -EFAULT;
+
+ reg = tmp32;
+ return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ u64 reg;
+
+ if (get_user(reg, uaddr))
+ return -EFAULT;
+
+ return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+ }
+ case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u64 reg;
+ u32 tmp32;
+
+ if (get_user(tmp32, uaddr))
+ return -EFAULT;
+
+ reg = tmp32;
+ return vgic_v3_attr_regs_access(dev, attr, &reg, true);
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CTRL: {
+ int ret;
+
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
+ mutex_lock(&dev->kvm->lock);
+
+ if (!lock_all_vcpus(dev->kvm)) {
+ mutex_unlock(&dev->kvm->lock);
+ return -EBUSY;
+ }
+ ret = vgic_v3_save_pending_tables(dev->kvm);
+ unlock_all_vcpus(dev->kvm);
+ mutex_unlock(&dev->kvm->lock);
+ return ret;
+ }
+ break;
+ }
+ }
+ return -ENXIO;
+}
+
+static int vgic_v3_get_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ int ret;
+
+ ret = vgic_get_common_attr(dev, attr);
+ if (ret != -ENXIO)
+ return ret;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u64 reg;
+ u32 tmp32;
+
+ ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+ if (ret)
+ return ret;
+ tmp32 = reg;
+ return put_user(tmp32, uaddr);
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ u64 reg;
+
+ ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+ if (ret)
+ return ret;
+ return put_user(reg, uaddr);
+ }
+ case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+ u32 __user *uaddr = (u32 __user *)(long)attr->addr;
+ u64 reg;
+ u32 tmp32;
+
+ ret = vgic_v3_attr_regs_access(dev, attr, &reg, false);
+ if (ret)
+ return ret;
+ tmp32 = reg;
+ return put_user(tmp32, uaddr);
+ }
+ }
+ return -ENXIO;
+}
+
+static int vgic_v3_has_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ switch (attr->attr) {
+ case KVM_VGIC_V3_ADDR_TYPE_DIST:
+ case KVM_VGIC_V3_ADDR_TYPE_REDIST:
+ case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
+ return 0;
+ }
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
+ case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
+ return vgic_v3_has_attr_regs(dev, attr);
+ case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
+ return 0;
+ case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
+ if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
+ KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) ==
+ VGIC_LEVEL_INFO_LINE_LEVEL)
+ return 0;
+ break;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return 0;
+ case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
+ return 0;
+ }
+ }
+ return -ENXIO;
+}
+
+struct kvm_device_ops kvm_arm_vgic_v3_ops = {
+ .name = "kvm-arm-vgic-v3",
+ .create = vgic_create,
+ .destroy = vgic_destroy,
+ .set_attr = vgic_v3_set_attr,
+ .get_attr = vgic_v3_get_attr,
+ .has_attr = vgic_v3_has_attr,
+};
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
new file mode 100644
index 000000000000..a016f07adc28
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGICv2 MMIO handling functions
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/nospec.h>
+
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * The Revision field in the IIDR has the following meanings:
+ *
+ * Revision 1: Report GICv2 interrupts as group 0 instead of group 1
+ * Revision 2: Interrupt groups are guest-configurable and signaled using
+ * their configured groups.
+ */
+
+static unsigned long vgic_mmio_read_v2_misc(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+ u32 value;
+
+ switch (addr & 0x0c) {
+ case GIC_DIST_CTRL:
+ value = vgic->enabled ? GICD_ENABLE : 0;
+ break;
+ case GIC_DIST_CTR:
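+ /*
+ * GICD_TYPER.ITLinesNumber (bits [4:0]) encodes the number of
+ * supported interrupt lines as (lines / 32) - 1, e.g. 224 SPIs
+ * plus the 32 private interrupts give 256 lines and a field
+ * value of 7. Bits [7:5] report the number of vCPUs minus one.
+ */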
+ value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
+ value = (value >> 5) - 1;
+ value |= (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
+ break;
+ case GIC_DIST_IIDR:
+ value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
+ (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
+ (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
+ break;
+ default:
+ return 0;
+ }
+
+ return value;
+}
+
+static void vgic_mmio_write_v2_misc(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ bool was_enabled = dist->enabled;
+
+ switch (addr & 0x0c) {
+ case GIC_DIST_CTRL:
+ dist->enabled = val & GICD_ENABLE;
+ if (!was_enabled && dist->enabled)
+ vgic_kick_vcpus(vcpu->kvm);
+ break;
+ case GIC_DIST_CTR:
+ case GIC_DIST_IIDR:
+ /* Nothing to do */
+ return;
+ }
+}
+
+static int vgic_mmio_uaccess_write_v2_misc(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ switch (addr & 0x0c) {
+ case GIC_DIST_IIDR:
+ if (val != vgic_mmio_read_v2_misc(vcpu, addr, len))
+ return -EINVAL;
+
+ /*
+ * If we observe a write to GICD_IIDR we know that userspace
+ * has been updated and has had a chance to cope with older
+ * kernels (VGICv2 IIDR.Revision == 0) incorrectly reporting
+ * interrupts as group 1, and therefore we now allow groups to
+ * be user writable. Doing this by default would break
+ * migration from old kernels to new kernels with legacy
+ * userspace.
+ */
+ vcpu->kvm->arch.vgic.v2_groups_user_writable = true;
+ return 0;
+ }
+
+ vgic_mmio_write_v2_misc(vcpu, addr, len, val);
+ return 0;
+}
+
+static int vgic_mmio_uaccess_write_v2_group(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ if (vcpu->kvm->arch.vgic.v2_groups_user_writable)
+ vgic_mmio_write_group(vcpu, addr, len, val);
+
+ return 0;
+}
+
+static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ int nr_vcpus = atomic_read(&source_vcpu->kvm->online_vcpus);
+ int intid = val & 0xf;
+ int targets = (val >> 16) & 0xff;
+ int mode = (val >> 24) & 0x03;
+ int c;
+ struct kvm_vcpu *vcpu;
+ unsigned long flags;
+
+ switch (mode) {
+ case 0x0: /* as specified by targets */
+ break;
+ case 0x1:
+ targets = (1U << nr_vcpus) - 1; /* all, ... */
+ targets &= ~(1U << source_vcpu->vcpu_id); /* but self */
+ break;
+ case 0x2: /* this very vCPU only */
+ targets = (1U << source_vcpu->vcpu_id);
+ break;
+ case 0x3: /* reserved */
+ return;
+ }
+
+ kvm_for_each_vcpu(c, vcpu, source_vcpu->kvm) {
+ struct vgic_irq *irq;
+
+ if (!(targets & (1U << c)))
+ continue;
+
+ irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->pending_latch = true;
+ irq->source |= 1U << source_vcpu->vcpu_id;
+
+ vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags);
+ vgic_put_irq(source_vcpu->kvm, irq);
+ }
+}
+
+static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+ int i;
+ u64 val = 0;
+
+ for (i = 0; i < len; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ val |= (u64)irq->targets << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return val;
+}
+
+static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+ u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0);
+ int i;
+ unsigned long flags;
+
+ /* GICD_ITARGETSR[0-7] are read-only */
+ if (intid < VGIC_NR_PRIVATE_IRQS)
+ return;
+
+ for (i = 0; i < len; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
+ int target;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ irq->targets = (val >> (i * 8)) & cpu_mask;
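+ /* Route the SPI to the lowest-numbered vCPU left in the mask (vCPU 0 if empty). */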
+ target = irq->targets ? __ffs(irq->targets) : 0;
+ irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = addr & 0x0f;
+ int i;
+ u64 val = 0;
+
+ for (i = 0; i < len; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ val |= (u64)irq->source << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+ return val;
+}
+
+static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = addr & 0x0f;
+ int i;
+ unsigned long flags;
+
+ for (i = 0; i < len; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ irq->source &= ~((val >> (i * 8)) & 0xff);
+ if (!irq->source)
+ irq->pending_latch = false;
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = addr & 0x0f;
+ int i;
+ unsigned long flags;
+
+ for (i = 0; i < len; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ irq->source |= (val >> (i * 8)) & 0xff;
+
+ if (irq->source) {
+ irq->pending_latch = true;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ } else {
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ }
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+#define GICC_ARCH_VERSION_V2 0x2
+
+/* These are for userland accesses only; there is no guest-facing emulation. */
+static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_vmcr vmcr;
+ u32 val;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+
+ switch (addr & 0xff) {
+ case GIC_CPU_CTRL:
+ val = vmcr.grpen0 << GIC_CPU_CTRL_EnableGrp0_SHIFT;
+ val |= vmcr.grpen1 << GIC_CPU_CTRL_EnableGrp1_SHIFT;
+ val |= vmcr.ackctl << GIC_CPU_CTRL_AckCtl_SHIFT;
+ val |= vmcr.fiqen << GIC_CPU_CTRL_FIQEn_SHIFT;
+ val |= vmcr.cbpr << GIC_CPU_CTRL_CBPR_SHIFT;
+ val |= vmcr.eoim << GIC_CPU_CTRL_EOImodeNS_SHIFT;
+
+ break;
+ case GIC_CPU_PRIMASK:
+ /*
+ * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+ * PMR field as GICH_VMCR.VMPriMask rather than
+ * GICC_PMR.Priority, so we expose the upper five bits of
+ * priority mask to userspace using the lower bits in the
+ * unsigned long.
+ */
+ val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >>
+ GICV_PMR_PRIORITY_SHIFT;
+ break;
+ case GIC_CPU_BINPOINT:
+ val = vmcr.bpr;
+ break;
+ case GIC_CPU_ALIAS_BINPOINT:
+ val = vmcr.abpr;
+ break;
+ case GIC_CPU_IDENT:
+ val = ((PRODUCT_ID_KVM << 20) |
+ (GICC_ARCH_VERSION_V2 << 16) |
+ IMPLEMENTER_ARM);
+ break;
+ default:
+ return 0;
+ }
+
+ return val;
+}
+
+static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_vmcr vmcr;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+
+ switch (addr & 0xff) {
+ case GIC_CPU_CTRL:
+ vmcr.grpen0 = !!(val & GIC_CPU_CTRL_EnableGrp0);
+ vmcr.grpen1 = !!(val & GIC_CPU_CTRL_EnableGrp1);
+ vmcr.ackctl = !!(val & GIC_CPU_CTRL_AckCtl);
+ vmcr.fiqen = !!(val & GIC_CPU_CTRL_FIQEn);
+ vmcr.cbpr = !!(val & GIC_CPU_CTRL_CBPR);
+ vmcr.eoim = !!(val & GIC_CPU_CTRL_EOImodeNS);
+
+ break;
+ case GIC_CPU_PRIMASK:
+ /*
+ * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the
+ * PMR field as GICH_VMCR.VMPriMask rather than
+ * GICC_PMR.Priority, so we expose the upper five bits of
+ * priority mask to userspace using the lower bits in the
+ * unsigned long.
+ */
+ vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) &
+ GICV_PMR_PRIORITY_MASK;
+ break;
+ case GIC_CPU_BINPOINT:
+ vmcr.bpr = val;
+ break;
+ case GIC_CPU_ALIAS_BINPOINT:
+ vmcr.abpr = val;
+ break;
+ }
+
+ vgic_set_vmcr(vcpu, &vmcr);
+}
+
+static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ int n; /* which APRn is this */
+
+ n = (addr >> 2) & 0x3;
+
+ if (kvm_vgic_global_state.type == VGIC_V2) {
+ /* GICv2 hardware systems support max. 32 groups */
+ if (n != 0)
+ return 0;
+ return vcpu->arch.vgic_cpu.vgic_v2.vgic_apr;
+ } else {
+ struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ if (n > vgic_v3_max_apr_idx(vcpu))
+ return 0;
+
+ n = array_index_nospec(n, 4);
+
+ /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+ return vgicv3->vgic_ap1r[n];
+ }
+}
+
+static void vgic_mmio_write_apr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ int n; /* which APRn is this */
+
+ n = (addr >> 2) & 0x3;
+
+ if (kvm_vgic_global_state.type == VGIC_V2) {
+ /* GICv2 hardware systems support max. 32 groups */
+ if (n != 0)
+ return;
+ vcpu->arch.vgic_cpu.vgic_v2.vgic_apr = val;
+ } else {
+ struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ if (n > vgic_v3_max_apr_idx(vcpu))
+ return;
+
+ n = array_index_nospec(n, 4);
+
+ /* GICv3 only uses ICH_AP1Rn for memory mapped (GICv2) guests */
+ vgicv3->vgic_ap1r[n] = val;
+ }
+}
+
+static const struct vgic_register_region vgic_v2_dist_registers[] = {
+ REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_DIST_CTRL,
+ vgic_mmio_read_v2_misc, vgic_mmio_write_v2_misc,
+ NULL, vgic_mmio_uaccess_write_v2_misc,
+ 12, VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_IGROUP,
+ vgic_mmio_read_group, vgic_mmio_write_group,
+ NULL, vgic_mmio_uaccess_write_v2_group, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
+ vgic_mmio_read_enable, vgic_mmio_write_senable,
+ NULL, vgic_uaccess_write_senable, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_CLEAR,
+ vgic_mmio_read_enable, vgic_mmio_write_cenable,
+ NULL, vgic_uaccess_write_cenable, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
+ vgic_mmio_read_pending, vgic_mmio_write_spending,
+ NULL, vgic_uaccess_write_spending, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
+ vgic_mmio_read_pending, vgic_mmio_write_cpending,
+ NULL, vgic_uaccess_write_cpending, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
+ vgic_mmio_read_active, vgic_mmio_write_sactive,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
+ vgic_mmio_read_active, vgic_mmio_write_cactive,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
+ vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+ 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_TARGET,
+ vgic_mmio_read_target, vgic_mmio_write_target, NULL, NULL, 8,
+ VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_CONFIG,
+ vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GIC_DIST_SOFTINT,
+ vgic_mmio_read_raz, vgic_mmio_write_sgir, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_CLEAR,
+ vgic_mmio_read_sgipend, vgic_mmio_write_sgipendc, 16,
+ VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+ REGISTER_DESC_WITH_LENGTH(GIC_DIST_SGI_PENDING_SET,
+ vgic_mmio_read_sgipend, vgic_mmio_write_sgipends, 16,
+ VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+};
+
+static const struct vgic_register_region vgic_v2_cpu_registers[] = {
+ REGISTER_DESC_WITH_LENGTH(GIC_CPU_CTRL,
+ vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GIC_CPU_PRIMASK,
+ vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GIC_CPU_BINPOINT,
+ vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GIC_CPU_ALIAS_BINPOINT,
+ vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GIC_CPU_ACTIVEPRIO,
+ vgic_mmio_read_apr, vgic_mmio_write_apr, 16,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
+ vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
+ VGIC_ACCESS_32bit),
+};
+
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
+{
+ dev->regions = vgic_v2_dist_registers;
+ dev->nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+
+ kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+ return SZ_4K;
+}
+
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ const struct vgic_register_region *region;
+ struct vgic_io_device iodev;
+ struct vgic_reg_attr reg_attr;
+ struct kvm_vcpu *vcpu;
+ gpa_t addr;
+ int ret;
+
+ ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
+ if (ret)
+ return ret;
+
+ vcpu = reg_attr.vcpu;
+ addr = reg_attr.addr;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ iodev.regions = vgic_v2_dist_registers;
+ iodev.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers);
+ iodev.base_addr = 0;
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
+ iodev.regions = vgic_v2_cpu_registers;
+ iodev.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
+ iodev.base_addr = 0;
+ break;
+ default:
+ return -ENXIO;
+ }
+
+ /* We only support aligned 32-bit accesses. */
+ if (addr & 3)
+ return -ENXIO;
+
+ region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
+ if (!region)
+ return -ENXIO;
+
+ return 0;
+}
+
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ int offset, u32 *val)
+{
+ struct vgic_io_device dev = {
+ .regions = vgic_v2_cpu_registers,
+ .nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+ .iodev_type = IODEV_CPUIF,
+ };
+
+ return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
+
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ int offset, u32 *val)
+{
+ struct vgic_io_device dev = {
+ .regions = vgic_v2_dist_registers,
+ .nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+ .iodev_type = IODEV_DIST,
+ };
+
+ return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
new file mode 100644
index 000000000000..d2339a2b9fb9
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -0,0 +1,1063 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGICv3 MMIO handling functions
+ */
+
+#include <linux/bitfield.h>
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_vgic.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/* extract @num bytes at a byte offset of @offset in @data */
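+/* e.g. extract_bytes(0x1122334455667788, 2, 2) == 0x5566 */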
+unsigned long extract_bytes(u64 data, unsigned int offset,
+ unsigned int num)
+{
+ return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
+}
+
+/* allows updates of any half of a 64-bit register (or the whole thing) */
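+/* e.g. offset = 4, len = 4 replaces bits [63:32]; offset = 0, len = 8 replaces all 64 bits */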
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+ unsigned long val)
+{
+ int lower = (offset & 4) * 8;
+ int upper = lower + 8 * len - 1;
+
+ reg &= ~GENMASK_ULL(upper, lower);
+ val &= GENMASK_ULL(len * 8 - 1, 0);
+
+ return reg | ((u64)val << lower);
+}
+
+bool vgic_has_its(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+
+ if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+ return false;
+
+ return dist->has_its;
+}
+
+bool vgic_supports_direct_msis(struct kvm *kvm)
+{
+ return (kvm_vgic_global_state.has_gicv4_1 ||
+ (kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm)));
+}
+
+/*
+ * The Revision field in the IIDR has the following meaning:
+ *
+ * Revision 2: Interrupt groups are guest-configurable and signaled using
+ * their configured groups.
+ */
+
+static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_dist *vgic = &vcpu->kvm->arch.vgic;
+ u32 value = 0;
+
+ switch (addr & 0x0c) {
+ case GICD_CTLR:
+ if (vgic->enabled)
+ value |= GICD_CTLR_ENABLE_SS_G1;
+ value |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
+ if (vgic->nassgireq)
+ value |= GICD_CTLR_nASSGIreq;
+ break;
+ case GICD_TYPER:
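+ /*
+ * ITLinesNumber lives in bits [4:0] as (lines / 32) - 1, and
+ * bits [23:19] report the supported interrupt ID bits minus
+ * one (ITS-capable configurations advertise more ID bits).
+ */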
+ value = vgic->nr_spis + VGIC_NR_PRIVATE_IRQS;
+ value = (value >> 5) - 1;
+ if (vgic_has_its(vcpu->kvm)) {
+ value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+ value |= GICD_TYPER_LPIS;
+ } else {
+ value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+ }
+ break;
+ case GICD_TYPER2:
+ if (kvm_vgic_global_state.has_gicv4_1)
+ value = GICD_TYPER2_nASSGIcap;
+ break;
+ case GICD_IIDR:
+ value = (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT) |
+ (vgic->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
+ (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT);
+ break;
+ default:
+ return 0;
+ }
+
+ return value;
+}
+
+static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+ switch (addr & 0x0c) {
+ case GICD_CTLR: {
+ bool was_enabled, is_hwsgi;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ was_enabled = dist->enabled;
+ is_hwsgi = dist->nassgireq;
+
+ dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+
+ /* Not a GICv4.1? No HW SGIs */
+ if (!kvm_vgic_global_state.has_gicv4_1)
+ val &= ~GICD_CTLR_nASSGIreq;
+
+ /* Dist stays enabled? nASSGIreq is RO */
+ if (was_enabled && dist->enabled) {
+ val &= ~GICD_CTLR_nASSGIreq;
+ val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi);
+ }
+
+ /* Switching HW SGIs? */
+ dist->nassgireq = val & GICD_CTLR_nASSGIreq;
+ if (is_hwsgi != dist->nassgireq)
+ vgic_v4_configure_vsgis(vcpu->kvm);
+
+ if (kvm_vgic_global_state.has_gicv4_1 &&
+ was_enabled != dist->enabled)
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
+ else if (!was_enabled && dist->enabled)
+ vgic_kick_vcpus(vcpu->kvm);
+
+ mutex_unlock(&vcpu->kvm->lock);
+ break;
+ }
+ case GICD_TYPER:
+ case GICD_TYPER2:
+ case GICD_IIDR:
+ /* This is at best for documentation purposes... */
+ return;
+ }
+}
+
+static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+ switch (addr & 0x0c) {
+ case GICD_TYPER2:
+ case GICD_IIDR:
+ if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
+ return -EINVAL;
+ return 0;
+ case GICD_CTLR:
+ /* Not a GICv4.1? No HW SGIs */
+ if (!kvm_vgic_global_state.has_gicv4_1)
+ val &= ~GICD_CTLR_nASSGIreq;
+
+ dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
+ dist->nassgireq = val & GICD_CTLR_nASSGIreq;
+ return 0;
+ }
+
+ vgic_mmio_write_v3_misc(vcpu, addr, len, val);
+ return 0;
+}
+
+static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ int intid = VGIC_ADDR_TO_INTID(addr, 64);
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+ unsigned long ret = 0;
+
+ if (!irq)
+ return 0;
+
+ /* The upper word is RAZ for us. */
+ if (!(addr & 4))
+ ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ return ret;
+}
+
+static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ int intid = VGIC_ADDR_TO_INTID(addr, 64);
+ struct vgic_irq *irq;
+ unsigned long flags;
+
+ /* The upper word is WI for us since we don't implement Aff3. */
+ if (addr & 4)
+ return;
+
+ irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+ if (!irq)
+ return;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ /* We only care about and preserve Aff0, Aff1 and Aff2. */
+ irq->mpidr = val & GENMASK(23, 0);
+ irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ bool was_enabled = vgic_cpu->lpis_enabled;
+
+ if (!vgic_has_its(vcpu->kvm))
+ return;
+
+ vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+ if (was_enabled && !vgic_cpu->lpis_enabled) {
+ vgic_flush_pending_lpis(vcpu);
+ vgic_its_invalidate_cache(vcpu->kvm);
+ }
+
+ if (!was_enabled && vgic_cpu->lpis_enabled)
+ vgic_enable_lpis(vcpu);
+}
+
+static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
+ int target_vcpu_id = vcpu->vcpu_id;
+ gpa_t last_rdist_typer = rdreg->base + GICR_TYPER +
+ (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE;
+ u64 value;
+
+ value = (u64)(mpidr & GENMASK(23, 0)) << 32;
+ value |= ((target_vcpu_id & 0xffff) << 8);
+
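+ /* Only the last redistributor registered in this region reports GICR_TYPER.Last. */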
+ if (addr == last_rdist_typer)
+ value |= GICR_TYPER_LAST;
+ if (vgic_has_its(vcpu->kvm))
+ value |= GICR_TYPER_PLPIS;
+
+ return extract_bytes(value, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ switch (addr & 0xffff) {
+ case GICD_PIDR2:
+ /* report a GICv3 compliant implementation */
+ return 0x3b;
+ }
+
+ return 0;
+}
+
+static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ u32 value = 0;
+ int i;
+
+ /*
+ * The pending state of an interrupt is latched in the pending_latch
+ * variable.
+ * Userspace will save and restore pending state and line_level
+ * separately.
+ * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
+ * for handling of ISPENDR and ICPENDR.
+ */
+ for (i = 0; i < len * 8; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ bool state = irq->pending_latch;
+
+ if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+ int err;
+
+ err = irq_get_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ &state);
+ WARN_ON(err);
+ }
+
+ if (state)
+ value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return value;
+}
+
+static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for (i = 0; i < len * 8; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (test_bit(i, &val)) {
+ /*
+ * pending_latch is set irrespective of the irq type
+ * (level or edge) so that restoring the pending state
+ * does not depend on the irq configuration having been
+ * restored first.
+ */
+ irq->pending_latch = true;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ } else {
+ irq->pending_latch = false;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
+/* We want to avoid outer shareable. */
+u64 vgic_sanitise_shareability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_OuterShareable:
+ return GIC_BASER_InnerShareable;
+ default:
+ return field;
+ }
+}
+
+/* Avoid any inner non-cacheable mapping. */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_CACHE_nCnB:
+ case GIC_BASER_CACHE_nC:
+ return GIC_BASER_CACHE_RaWb;
+ default:
+ return field;
+ }
+}
+
+/* Non-cacheable or same-as-inner are OK. */
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_CACHE_SameAsInner:
+ case GIC_BASER_CACHE_nC:
+ return field;
+ default:
+ return GIC_BASER_CACHE_nC;
+ }
+}
+
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+ u64 (*sanitise_fn)(u64))
+{
+ u64 field = (reg & field_mask) >> field_shift;
+
+ field = sanitise_fn(field) << field_shift;
+ return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK \
+ (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK \
+ (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \
+ GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+ GICR_PENDBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+ GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+ GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ reg &= ~PENDBASER_RES0_MASK;
+
+ return reg;
+}
+
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+ GICR_PROPBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+ GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+ GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ reg &= ~PROPBASER_RES0_MASK;
+ return reg;
+}
+
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+ return extract_bytes(dist->propbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u64 old_propbaser, propbaser;
+
+ /* Storing a value with LPIs already enabled is undefined */
+ if (vgic_cpu->lpis_enabled)
+ return;
+
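+ /* Lockless read-modify-write: re-sanitise and retry if another writer raced with us. */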
+ do {
+ old_propbaser = READ_ONCE(dist->propbaser);
+ propbaser = old_propbaser;
+ propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+ propbaser = vgic_sanitise_propbaser(propbaser);
+ } while (cmpxchg64(&dist->propbaser, old_propbaser,
+ propbaser) != old_propbaser);
+}
+
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u64 value = vgic_cpu->pendbaser;
+
+ value &= ~GICR_PENDBASER_PTZ;
+
+ return extract_bytes(value, addr & 7, len);
+}
+
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u64 old_pendbaser, pendbaser;
+
+ /* Storing a value with LPIs already enabled is undefined */
+ if (vgic_cpu->lpis_enabled)
+ return;
+
+ do {
+ old_pendbaser = READ_ONCE(vgic_cpu->pendbaser);
+ pendbaser = old_pendbaser;
+ pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+ pendbaser = vgic_sanitise_pendbaser(pendbaser);
+ } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser,
+ pendbaser) != old_pendbaser);
+}
+
+/*
+ * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
+ * redistributors, while SPIs are covered by registers in the distributor
+ * block. Trying to set private IRQs in this block gets ignored.
+ * We take some special care here to fix the calculation of the register
+ * offset.
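+ * For example, with one bit per interrupt (bpi = 1) the first entry is a
+ * 4-byte RAZ/WI window covering the 32 private interrupts, and the real
+ * handlers start at offset + 4 for the remaining 1024 - 32 interrupt IDs.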
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
+ { \
+ .reg_offset = off, \
+ .bits_per_irq = bpi, \
+ .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \
+ .access_flags = acc, \
+ .read = vgic_mmio_read_raz, \
+ .write = vgic_mmio_write_wi, \
+ }, { \
+ .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8, \
+ .bits_per_irq = bpi, \
+ .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8, \
+ .access_flags = acc, \
+ .read = rd, \
+ .write = wr, \
+ .uaccess_read = ur, \
+ .uaccess_write = uw, \
+ }
+
+static const struct vgic_register_region vgic_v3_dist_registers[] = {
+ REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR,
+ vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc,
+ NULL, vgic_mmio_uaccess_write_v3_misc,
+ 16, VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
+ vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
+ vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
+ vgic_mmio_read_enable, vgic_mmio_write_senable,
+ NULL, vgic_uaccess_write_senable, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
+ vgic_mmio_read_enable, vgic_mmio_write_cenable,
+ NULL, vgic_uaccess_write_cenable, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
+ vgic_mmio_read_pending, vgic_mmio_write_spending,
+ vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
+ vgic_mmio_read_pending, vgic_mmio_write_cpending,
+ vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
+ vgic_mmio_read_active, vgic_mmio_write_sactive,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
+ vgic_mmio_read_active, vgic_mmio_write_cactive,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
+ 1, VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
+ vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
+ 8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
+ vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
+ VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
+ vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
+ vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
+ vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
+ vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+ VGIC_ACCESS_32bit),
+};
+
+static const struct vgic_register_region vgic_v3_rd_registers[] = {
+ /* RD_base registers */
+ REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
+ vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
+ vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
+ vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
+ vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
+ vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
+ vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
+ vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
+ vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
+ VGIC_ACCESS_32bit),
+ /* SGI_base registers */
+ REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0,
+ vgic_mmio_read_group, vgic_mmio_write_group, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0,
+ vgic_mmio_read_enable, vgic_mmio_write_senable,
+ NULL, vgic_uaccess_write_senable, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0,
+ vgic_mmio_read_enable, vgic_mmio_write_cenable,
+ NULL, vgic_uaccess_write_cenable, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
+ vgic_mmio_read_pending, vgic_mmio_write_spending,
+ vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
+ vgic_mmio_read_pending, vgic_mmio_write_cpending,
+ vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
+ vgic_mmio_read_active, vgic_mmio_write_sactive,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
+ vgic_mmio_read_active, vgic_mmio_write_cactive,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
+ vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
+ VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
+ REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0,
+ vgic_mmio_read_config, vgic_mmio_write_config, 8,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0,
+ vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR,
+ vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+ VGIC_ACCESS_32bit),
+};
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
+{
+ dev->regions = vgic_v3_dist_registers;
+ dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+
+ kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+ return SZ_64K;
+}
+
+/**
+ * vgic_register_redist_iodev - register a single redist iodev
+ * @vcpu: The VCPU to which the redistributor belongs
+ *
+ * Register a KVM iodev for this VCPU's redistributor using the address
+ * provided.
+ *
+ * Return 0 on success, -ERRNO otherwise.
+ */
+int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct vgic_dist *vgic = &kvm->arch.vgic;
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+ struct vgic_redist_region *rdreg;
+ gpa_t rd_base;
+ int ret;
+
+ if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
+ return 0;
+
+ /*
+ * We may be creating VCPUs before having set the base address for the
+ * redistributor region, in which case we will come back to this
+ * function for all VCPUs when the base address is set. Just return
+ * without doing any work for now.
+ */
+ rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
+ if (!rdreg)
+ return 0;
+
+ if (!vgic_v3_check_base(kvm))
+ return -EINVAL;
+
+ vgic_cpu->rdreg = rdreg;
+
+ rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;
+
+ kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
+ rd_dev->base_addr = rd_base;
+ rd_dev->iodev_type = IODEV_REDIST;
+ rd_dev->regions = vgic_v3_rd_registers;
+ rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
+ rd_dev->redist_vcpu = vcpu;
+
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
+ 2 * SZ_64K, &rd_dev->dev);
+ mutex_unlock(&kvm->slots_lock);
+
+ if (ret)
+ return ret;
+
+ rdreg->free_index++;
+ return 0;
+}
+
+static void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
+{
+ struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+
+ kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS, &rd_dev->dev);
+}
+
+static int vgic_register_all_redist_iodevs(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int c, ret = 0;
+
+ kvm_for_each_vcpu(c, vcpu, kvm) {
+ ret = vgic_register_redist_iodev(vcpu);
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ /* The current c failed, so we start with the previous one. */
+ mutex_lock(&kvm->slots_lock);
+ for (c--; c >= 0; c--) {
+ vcpu = kvm_get_vcpu(kvm, c);
+ vgic_unregister_redist_iodev(vcpu);
+ }
+ mutex_unlock(&kvm->slots_lock);
+ }
+
+ return ret;
+}
+
+/**
+ * vgic_v3_insert_redist_region - Insert a new redistributor region
+ *
+ * Performs various checks before inserting the rdist region in the list.
+ * Those tests depend on whether the size of the rdist region is known
+ * (i.e. count != 0). The list is sorted by rdist region index.
+ *
+ * @kvm: kvm handle
+ * @index: redist region index
+ * @base: base of the new rdist region
+ * @count: number of redistributors the region is made of (0 for the
+ * old-style single region, whose size is derived from the number of vcpus)
+ *
+ * Return 0 on success, < 0 otherwise
+ */
+static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index,
+ gpa_t base, uint32_t count)
+{
+ struct vgic_dist *d = &kvm->arch.vgic;
+ struct vgic_redist_region *rdreg;
+ struct list_head *rd_regions = &d->rd_regions;
+ size_t size = count * KVM_VGIC_V3_REDIST_SIZE;
+ int ret;
+
+ /* single rdist region already set? */
+ if (!count && !list_empty(rd_regions))
+ return -EINVAL;
+
+ /* cross the end of memory? */
+ if (base + size < base)
+ return -EINVAL;
+
+ if (list_empty(rd_regions)) {
+ if (index != 0)
+ return -EINVAL;
+ } else {
+ rdreg = list_last_entry(rd_regions,
+ struct vgic_redist_region, list);
+ if (index != rdreg->index + 1)
+ return -EINVAL;
+
+ /* Cannot add an explicitly sized region after a legacy region */
+ if (!rdreg->count)
+ return -EINVAL;
+ }
+
+ /*
+ * For legacy single-region redistributor regions (!count),
+ * check that the redistributor region does not overlap with the
+ * distributor's address space.
+ */
+ if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
+ vgic_dist_overlap(kvm, base, size))
+ return -EINVAL;
+
+ /* collision with any other rdist region? */
+ if (vgic_v3_rdist_overlap(kvm, base, size))
+ return -EINVAL;
+
+ rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL);
+ if (!rdreg)
+ return -ENOMEM;
+
+ rdreg->base = VGIC_ADDR_UNDEF;
+
+ ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K);
+ if (ret)
+ goto free;
+
+ rdreg->base = base;
+ rdreg->count = count;
+ rdreg->free_index = 0;
+ rdreg->index = index;
+
+ list_add_tail(&rdreg->list, rd_regions);
+ return 0;
+free:
+ kfree(rdreg);
+ return ret;
+}
+
+int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
+{
+ int ret;
+
+ ret = vgic_v3_insert_redist_region(kvm, index, addr, count);
+ if (ret)
+ return ret;
+
+ /*
+ * Register iodevs for each existing VCPU. Adding more VCPUs
+ * afterwards will register the iodevs when needed.
+ */
+ ret = vgic_register_all_redist_iodevs(kvm);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+ const struct vgic_register_region *region;
+ struct vgic_io_device iodev;
+ struct vgic_reg_attr reg_attr;
+ struct kvm_vcpu *vcpu;
+ gpa_t addr;
+ int ret;
+
+ ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
+ if (ret)
+ return ret;
+
+ vcpu = reg_attr.vcpu;
+ addr = reg_attr.addr;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
+ iodev.regions = vgic_v3_dist_registers;
+ iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
+ iodev.base_addr = 0;
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: {
+ iodev.regions = vgic_v3_rd_registers;
+ iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
+ iodev.base_addr = 0;
+ break;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: {
+ u64 reg, id;
+
+ id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK);
+ return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, &reg);
+ }
+ default:
+ return -ENXIO;
+ }
+
+ /* We only support aligned 32-bit accesses. */
+ if (addr & 3)
+ return -ENXIO;
+
+ region = vgic_get_mmio_region(vcpu, &iodev, addr, sizeof(u32));
+ if (!region)
+ return -ENXIO;
+
+ return 0;
+}
+
+/*
+ * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
+ * generation register ICC_SGI1R_EL1) with a given VCPU.
+ * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
+ * return -1.
+ */
+static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
+{
+ unsigned long affinity;
+ int level0;
+
+ /*
+ * Split the current VCPU's MPIDR into affinity level 0 and the
+ * rest as this is what we have to compare against.
+ */
+ affinity = kvm_vcpu_get_mpidr_aff(vcpu);
+ level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
+ affinity &= ~MPIDR_LEVEL_MASK;
+
+ /* bail out if the upper three levels don't match */
+ if (sgi_aff != affinity)
+ return -1;
+
+ /* Is this VCPU's bit set in the mask ? */
+ if (!(sgi_cpu_mask & BIT(level0)))
+ return -1;
+
+ return level0;
+}
+
+/*
+ * The ICC_SGI* registers encode the affinity differently from the MPIDR,
+ * so provide a wrapper to use the existing defines to isolate a certain
+ * affinity level.
+ */
+#define SGI_AFFINITY_LEVEL(reg, level) \
+ ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
+ >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+
+/**
+ * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
+ * @vcpu: The VCPU requesting a SGI
+ * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU
+ * @allow_group1: Does the sysreg access allow generation of G1 SGIs
+ *
+ * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
+ * This will trap in sys_regs.c and call this function.
+ * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
+ * target processors as well as a bitmask of 16 Aff0 CPUs.
+ * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
+ * check for matching ones. If this bit is set, we signal all, but not the
+ * calling VCPU.
+ */
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu *c_vcpu;
+ u16 target_cpus;
+ u64 mpidr;
+ int sgi, c;
+ int vcpu_id = vcpu->vcpu_id;
+ bool broadcast;
+ unsigned long flags;
+
+ sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
+ broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
+ target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
+ mpidr = SGI_AFFINITY_LEVEL(reg, 3);
+ mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
+ mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
+
+ /*
+ * We iterate over all VCPUs to find the MPIDRs matching the request.
+ * If we have handled one CPU, we clear its bit to detect early
+ * if we are already finished. This avoids iterating through all
+ * VCPUs when most of the time we just signal a single VCPU.
+ */
+ kvm_for_each_vcpu(c, c_vcpu, kvm) {
+ struct vgic_irq *irq;
+
+ /* Exit early if we have dealt with all requested CPUs */
+ if (!broadcast && target_cpus == 0)
+ break;
+
+ /* Don't signal the calling VCPU */
+ if (broadcast && c == vcpu_id)
+ continue;
+
+ if (!broadcast) {
+ int level0;
+
+ level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
+ if (level0 == -1)
+ continue;
+
+ /* remove this matching VCPU from the mask */
+ target_cpus &= ~BIT(level0);
+ }
+
+ irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ /*
+ * An access targeting Group0 SGIs can only generate
+ * those, while an access targeting Group1 SGIs can
+ * generate interrupts of either group.
+ */
+ if (!irq->group || allow_group1) {
+ if (!irq->hw) {
+ irq->pending_latch = true;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ } else {
+ /* HW SGI? Ask the GIC to inject it */
+ int err;
+ err = irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ true);
+ WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ }
+ } else {
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ int offset, u32 *val)
+{
+ struct vgic_io_device dev = {
+ .regions = vgic_v3_dist_registers,
+ .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers),
+ };
+
+ return vgic_uaccess(vcpu, &dev, is_write, offset, val);
+}
+
+int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ int offset, u32 *val)
+{
+ struct vgic_io_device rd_dev = {
+ .regions = vgic_v3_rd_registers,
+ .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers),
+ };
+
+ return vgic_uaccess(vcpu, &rd_dev, is_write, offset, val);
+}
+
+int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ u32 intid, u64 *val)
+{
+ if (intid % 32)
+ return -EINVAL;
+
+ if (is_write)
+ vgic_write_irq_line_level_info(vcpu, intid, *val);
+ else
+ *val = vgic_read_irq_line_level_info(vcpu, intid);
+
+ return 0;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
new file mode 100644
index 000000000000..b2d73fc0d1ef
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -0,0 +1,1088 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VGIC MMIO handling functions
+ */
+
+#include <linux/bitops.h>
+#include <linux/bsearch.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/iodev.h>
+#include <kvm/arm_arch_timer.h>
+#include <kvm/arm_vgic.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return 0;
+}
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return -1UL;
+}
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val)
+{
+ /* Ignore */
+}
+
+int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val)
+{
+ /* Ignore */
+ return 0;
+}
+
+unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ u32 value = 0;
+ int i;
+
+ /* Loop over all IRQs affected by this read */
+ for (i = 0; i < len * 8; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ if (irq->group)
+ value |= BIT(i);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return value;
+}
+
+static void vgic_update_vsgi(struct vgic_irq *irq)
+{
+ WARN_ON(its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group));
+}
+
+void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for (i = 0; i < len * 8; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->group = !!(val & BIT(i));
+ if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+ vgic_update_vsgi(irq);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ } else {
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+/*
+ * Read accesses to both GICD_ICENABLER and GICD_ISENABLER return the value
+ * of the enabled bit, so there is only one function for both here.
+ */
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ u32 value = 0;
+ int i;
+
+ /* Loop over all IRQs affected by this read */
+ for (i = 0; i < len * 8; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ if (irq->enabled)
+ value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return value;
+}
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+ if (!irq->enabled) {
+ struct irq_data *data;
+
+ irq->enabled = true;
+ data = &irq_to_desc(irq->host_irq)->irq_data;
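+ /*
+ * enable_irq() only drops the disable depth by one, so keep
+ * calling it until the host interrupt line is really enabled.
+ */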
+ while (irqd_irq_disabled(data))
+ enable_irq(irq->host_irq);
+ }
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+
+ continue;
+ } else if (vgic_irq_is_mapped_level(irq)) {
+ bool was_high = irq->line_level;
+
+ /*
+ * We need to update the state of the interrupt because
+ * the guest might have changed the state of the device
+ * while the interrupt was disabled at the VGIC level.
+ */
+ irq->line_level = vgic_get_phys_line_level(irq);
+ /*
+ * Deactivate the physical interrupt so the GIC will let
+ * us know when it is asserted again.
+ */
+ if (!irq->active && was_high && !irq->line_level)
+ vgic_irq_set_phys_active(irq, false);
+ }
+ irq->enabled = true;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (irq->hw && vgic_irq_is_sgi(irq->intid) && irq->enabled)
+ disable_irq_nosync(irq->host_irq);
+
+ irq->enabled = false;
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->enabled = true;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
+int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->enabled = false;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ u32 value = 0;
+ int i;
+
+ /* Loop over all IRQs affected by this read */
+ for (i = 0; i < len * 8; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ unsigned long flags;
+ bool val;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+ int err;
+
+ val = false;
+ err = irq_get_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ &val);
+ WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+ } else {
+ val = irq_is_pending(irq);
+ }
+
+ value |= ((u32)val << i);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return value;
+}
+
+static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
+{
+ return (vgic_irq_is_sgi(irq->intid) &&
+ vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2);
+}
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ /* GICD_ISPENDR0 SGI bits are WI */
+ if (is_vgic_v2_sgi(vcpu, irq)) {
+ vgic_put_irq(vcpu->kvm, irq);
+ continue;
+ }
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+ /* HW SGI? Ask the GIC to inject it */
+ int err;
+ err = irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ true);
+ WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+
+ continue;
+ }
+
+ irq->pending_latch = true;
+ if (irq->hw)
+ vgic_irq_set_phys_active(irq, true);
+
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->pending_latch = true;
+
+ /*
+ * GICv2 SGIs are terribly broken. We can't restore
+ * the source of the interrupt, so just pick the vcpu
+ * itself as the source...
+ */
+ if (is_vgic_v2_sgi(vcpu, irq))
+ irq->source |= BIT(vcpu->vcpu_id);
+
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
+{
+ irq->pending_latch = false;
+
+ /*
+ * We don't want the guest to effectively mask the physical
+ * interrupt by doing a write to SPENDR followed by a write to
+ * CPENDR for HW interrupts, so we clear the active state on
+ * the physical side if the virtual interrupt is not active.
+ * This may lead to taking an additional interrupt on the
+ * host, but that should not be a problem as the worst that
+ * can happen is an additional vgic injection. We also clear
+ * the pending state to maintain proper semantics for edge HW
+ * interrupts.
+ */
+ vgic_irq_set_phys_pending(irq, false);
+ if (!irq->active)
+ vgic_irq_set_phys_active(irq, false);
+}
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ /* GICD_ICPENDR0 SGI bits are WI */
+ if (is_vgic_v2_sgi(vcpu, irq)) {
+ vgic_put_irq(vcpu->kvm, irq);
+ continue;
+ }
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+ /* HW SGI? Ask the GIC to clear its pending bit */
+ int err;
+ err = irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ false);
+ WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+
+ continue;
+ }
+
+ if (irq->hw)
+ vgic_hw_irq_cpending(vcpu, irq);
+ else
+ irq->pending_latch = false;
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+ unsigned long flags;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ /*
+ * More fun with GICv2 SGIs! If we're clearing one of them
+ * from userspace, which source vcpu to clear? Let's not
+ * even think of it, and blow the whole set.
+ */
+ if (is_vgic_v2_sgi(vcpu, irq))
+ irq->source = 0;
+
+ irq->pending_latch = false;
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return 0;
+}
+
+/*
+ * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
+ * is not queued on some running VCPU's LRs, because then the change to the
+ * active state can be overwritten when the VCPU's state is synced coming back
+ * from the guest.
+ *
+ * For shared interrupts as well as GICv3 private interrupts, we have to
+ * stop all the VCPUs because interrupts can be migrated while we don't hold
+ * the IRQ locks and we don't want to be chasing moving targets.
+ *
+ * For GICv2 private interrupts we don't have to do anything because
+ * userspace accesses to the VGIC state already require all VCPUs to be
+ * stopped, and only the VCPU itself can modify its private interrupts
+ * active state, which guarantees that the VCPU is not running.
+ */
+static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
+{
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid >= VGIC_NR_PRIVATE_IRQS)
+ kvm_arm_halt_guest(vcpu->kvm);
+}
+
+/* See vgic_access_active_prepare */
+static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
+{
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid >= VGIC_NR_PRIVATE_IRQS)
+ kvm_arm_resume_guest(vcpu->kvm);
+}
+
+static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ u32 value = 0;
+ int i;
+
+ /* Loop over all IRQs affected by this read */
+ for (i = 0; i < len * 8; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ /*
+ * Even for HW interrupts, don't evaluate the HW state;
+ * the guest is only interested in the virtual state.
+ */
+ if (irq->active)
+ value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return value;
+}
+
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ u32 val;
+
+ mutex_lock(&vcpu->kvm->lock);
+ vgic_access_active_prepare(vcpu, intid);
+
+ val = __vgic_mmio_read_active(vcpu, addr, len);
+
+ vgic_access_active_finish(vcpu, intid);
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return val;
+}
+
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return __vgic_mmio_read_active(vcpu, addr, len);
+}
+
+/* Must be called with irq->irq_lock held */
+static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+ bool active, bool is_uaccess)
+{
+ if (is_uaccess)
+ return;
+
+ irq->active = active;
+ vgic_irq_set_phys_active(irq, active);
+}
+
+static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+ bool active)
+{
+ unsigned long flags;
+ struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu();
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ if (irq->hw && !vgic_irq_is_sgi(irq->intid)) {
+ vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
+ } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
+ /*
+ * GICv4.1 VSGI feature doesn't track an active state,
+ * so let's not kid ourselves, there is nothing we can
+ * do here.
+ */
+ irq->active = false;
+ } else {
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u8 active_source;
+
+ irq->active = active;
+
+ /*
+ * The GICv2 architecture indicates that the source CPUID for
+ * an SGI should be provided during an EOI which implies that
+ * the active state is stored somewhere, but at the same time
+ * this state is not architecturally exposed anywhere and we
+ * have no way of knowing the right source.
+ *
+ * This may lead to a VCPU not being able to receive
+ * additional instances of a particular SGI after migration
+ * for a GICv2 VM on some GIC implementations. Oh well.
+ */
+ active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
+
+ if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+ active && vgic_irq_is_sgi(irq->intid))
+ irq->active_source = active_source;
+ }
+
+ if (irq->active)
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ else
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+}
+
+static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ vgic_mmio_change_active(vcpu, irq, false);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+
+ mutex_lock(&vcpu->kvm->lock);
+ vgic_access_active_prepare(vcpu, intid);
+
+ __vgic_mmio_write_cactive(vcpu, addr, len, val);
+
+ vgic_access_active_finish(vcpu, intid);
+ mutex_unlock(&vcpu->kvm->lock);
+}
+
+int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ __vgic_mmio_write_cactive(vcpu, addr, len, val);
+ return 0;
+}
+
+static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ int i;
+
+ for_each_set_bit(i, &val, len * 8) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ vgic_mmio_change_active(vcpu, irq, true);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+
+ mutex_lock(&vcpu->kvm->lock);
+ vgic_access_active_prepare(vcpu, intid);
+
+ __vgic_mmio_write_sactive(vcpu, addr, len, val);
+
+ vgic_access_active_finish(vcpu, intid);
+ mutex_unlock(&vcpu->kvm->lock);
+}
+
+int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ __vgic_mmio_write_sactive(vcpu, addr, len, val);
+ return 0;
+}
+
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+ int i;
+ u64 val = 0;
+
+ for (i = 0; i < len; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ val |= (u64)irq->priority << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return val;
+}
+
+/*
+ * We currently don't handle changing the priority of an interrupt that
+ * is already pending on a VCPU. If there is a need for this, we would
+ * need to make this VCPU exit and re-evaluate the priorities, potentially
+ * leading to this interrupt getting presented now to the guest (if it has
+ * been masked by the priority mask before).
+ */
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 8);
+ int i;
+ unsigned long flags;
+
+ for (i = 0; i < len; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ /* Narrow the priority range to what we actually support */
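+ /*
+ * e.g. with VGIC_PRI_BITS == 5 the mask below is 0xf8, so a guest
+ * write of 0xab would be stored as 0xa8 (illustrative values only).
+ */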
+ irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
+ if (irq->hw && vgic_irq_is_sgi(irq->intid))
+ vgic_update_vsgi(irq);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+ u32 value = 0;
+ int i;
+
+ for (i = 0; i < len * 4; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ if (irq->config == VGIC_CONFIG_EDGE)
+ value |= (2U << (i * 2));
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return value;
+}
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 2);
+ int i;
+ unsigned long flags;
+
+ for (i = 0; i < len * 4; i++) {
+ struct vgic_irq *irq;
+
+ /*
+ * The configuration cannot be changed for SGIs in general,
+ * for PPIs this is IMPLEMENTATION DEFINED. The arch timer
+ * code relies on PPIs being level triggered, so we also
+ * make them read-only here.
+ */
+ if (intid + i < VGIC_NR_PRIVATE_IRQS)
+ continue;
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ if (test_bit(i * 2 + 1, &val))
+ irq->config = VGIC_CONFIG_EDGE;
+ else
+ irq->config = VGIC_CONFIG_LEVEL;
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
+{
+ int i;
+ u64 val = 0;
+ int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+
+ for (i = 0; i < 32; i++) {
+ struct vgic_irq *irq;
+
+ if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
+ continue;
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
+ val |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ return val;
+}
+
+void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
+ const u64 val)
+{
+ int i;
+ int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+ unsigned long flags;
+
+ for (i = 0; i < 32; i++) {
+ struct vgic_irq *irq;
+ bool new_level;
+
+ if ((intid + i) < VGIC_NR_SGIS || (intid + i) >= nr_irqs)
+ continue;
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+
+ /*
+ * The line level is set irrespective of the irq type (level or
+ * edge) so that the VM does not have to restore the irq
+ * configuration before the line level.
+ */
+ new_level = !!(val & (1U << i));
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->line_level = new_level;
+ if (new_level)
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+ else
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+static int match_region(const void *key, const void *elt)
+{
+ const unsigned int offset = (unsigned long)key;
+ const struct vgic_register_region *region = elt;
+
+ if (offset < region->reg_offset)
+ return -1;
+
+ if (offset >= region->reg_offset + region->len)
+ return 1;
+
+ return 0;
+}
+
+const struct vgic_register_region *
+vgic_find_mmio_region(const struct vgic_register_region *regions,
+ int nr_regions, unsigned int offset)
+{
+ return bsearch((void *)(uintptr_t)offset, regions, nr_regions,
+ sizeof(regions[0]), match_region);
+}
+
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_set_vmcr(vcpu, vmcr);
+ else
+ vgic_v3_set_vmcr(vcpu, vmcr);
+}
+
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_get_vmcr(vcpu, vmcr);
+ else
+ vgic_v3_get_vmcr(vcpu, vmcr);
+}
+
+/*
+ * kvm_mmio_read_buf() returns a value in a format where it can be converted
+ * to a byte array and be directly observed as the guest wanted it to appear
+ * in memory if it had done the store itself, which is LE for the GIC, as the
+ * guest knows the GIC is always LE.
+ *
+ * We convert this value to the CPU's native format to deal with it as a data
+ * value.
+ */
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
+{
+ unsigned long data = kvm_mmio_read_buf(val, len);
+
+ switch (len) {
+ case 1:
+ return data;
+ case 2:
+ return le16_to_cpu(data);
+ case 4:
+ return le32_to_cpu(data);
+ default:
+ return le64_to_cpu(data);
+ }
+}
+
+/*
+ * kvm_mmio_write_buf() expects a value in a format such that if converted to
+ * a byte array it is observed as the guest would see it if it could perform
+ * the load directly. Since the GIC is LE, and the guest knows this, the
+ * guest expects a value in little endian format.
+ *
+ * We convert the data value from the CPU's native format to LE so that the
+ * value is returned in the proper format.
+ */
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+ unsigned long data)
+{
+ switch (len) {
+ case 1:
+ break;
+ case 2:
+ data = cpu_to_le16(data);
+ break;
+ case 4:
+ data = cpu_to_le32(data);
+ break;
+ default:
+ data = cpu_to_le64(data);
+ }
+
+ kvm_mmio_write_buf(buf, len, data);
+}
+
+static
+struct vgic_io_device *kvm_to_vgic_iodev(const struct kvm_io_device *dev)
+{
+ return container_of(dev, struct vgic_io_device, dev);
+}
+
+static bool check_region(const struct kvm *kvm,
+ const struct vgic_register_region *region,
+ gpa_t addr, int len)
+{
+ int flags, nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
+
+ switch (len) {
+ case sizeof(u8):
+ flags = VGIC_ACCESS_8bit;
+ break;
+ case sizeof(u32):
+ flags = VGIC_ACCESS_32bit;
+ break;
+ case sizeof(u64):
+ flags = VGIC_ACCESS_64bit;
+ break;
+ default:
+ return false;
+ }
+
+ if ((region->access_flags & flags) && IS_ALIGNED(addr, len)) {
+ if (!region->bits_per_irq)
+ return true;
+
+ /* Do we access a non-allocated IRQ? */
+ return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs;
+ }
+
+ return false;
+}
+
+const struct vgic_register_region *
+vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
+ gpa_t addr, int len)
+{
+ const struct vgic_register_region *region;
+
+ region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
+ addr - iodev->base_addr);
+ if (!region || !check_region(vcpu->kvm, region, addr, len))
+ return NULL;
+
+ return region;
+}
+
+static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, u32 *val)
+{
+ struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+ const struct vgic_register_region *region;
+ struct kvm_vcpu *r_vcpu;
+
+ region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
+ if (!region) {
+ *val = 0;
+ return 0;
+ }
+
+ r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
+ if (region->uaccess_read)
+ *val = region->uaccess_read(r_vcpu, addr, sizeof(u32));
+ else
+ *val = region->read(r_vcpu, addr, sizeof(u32));
+
+ return 0;
+}
+
+static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, const u32 *val)
+{
+ struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+ const struct vgic_register_region *region;
+ struct kvm_vcpu *r_vcpu;
+
+ region = vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
+ if (!region)
+ return 0;
+
+ r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
+ if (region->uaccess_write)
+ return region->uaccess_write(r_vcpu, addr, sizeof(u32), *val);
+
+ region->write(r_vcpu, addr, sizeof(u32), *val);
+ return 0;
+}
+
+/*
+ * Userland access to VGIC registers.
+ */
+int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
+ bool is_write, int offset, u32 *val)
+{
+ if (is_write)
+ return vgic_uaccess_write(vcpu, &dev->dev, offset, val);
+ else
+ return vgic_uaccess_read(vcpu, &dev->dev, offset, val);
+}
+
+static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+ const struct vgic_register_region *region;
+ unsigned long data = 0;
+
+ region = vgic_get_mmio_region(vcpu, iodev, addr, len);
+ if (!region) {
+ memset(val, 0, len);
+ return 0;
+ }
+
+ switch (iodev->iodev_type) {
+ case IODEV_CPUIF:
+ data = region->read(vcpu, addr, len);
+ break;
+ case IODEV_DIST:
+ data = region->read(vcpu, addr, len);
+ break;
+ case IODEV_REDIST:
+ data = region->read(iodev->redist_vcpu, addr, len);
+ break;
+ case IODEV_ITS:
+ data = region->its_read(vcpu->kvm, iodev->its, addr, len);
+ break;
+ }
+
+ vgic_data_host_to_mmio_bus(val, len, data);
+ return 0;
+}
+
+static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
+ const struct vgic_register_region *region;
+ unsigned long data = vgic_data_mmio_bus_to_host(val, len);
+
+ region = vgic_get_mmio_region(vcpu, iodev, addr, len);
+ if (!region)
+ return 0;
+
+ switch (iodev->iodev_type) {
+ case IODEV_CPUIF:
+ region->write(vcpu, addr, len, data);
+ break;
+ case IODEV_DIST:
+ region->write(vcpu, addr, len, data);
+ break;
+ case IODEV_REDIST:
+ region->write(iodev->redist_vcpu, addr, len, data);
+ break;
+ case IODEV_ITS:
+ region->its_write(vcpu->kvm, iodev->its, addr, len, data);
+ break;
+ }
+
+ return 0;
+}
+
+struct kvm_io_device_ops kvm_io_gic_ops = {
+ .read = dispatch_mmio_read,
+ .write = dispatch_mmio_write,
+};
+
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+ enum vgic_type type)
+{
+ struct vgic_io_device *io_device = &kvm->arch.vgic.dist_iodev;
+ int ret = 0;
+ unsigned int len;
+
+ switch (type) {
+ case VGIC_V2:
+ len = vgic_v2_init_dist_iodev(io_device);
+ break;
+ case VGIC_V3:
+ len = vgic_v3_init_dist_iodev(io_device);
+ break;
+ default:
+ BUG_ON(1);
+ }
+
+ io_device->base_addr = dist_base_address;
+ io_device->iodev_type = IODEV_DIST;
+ io_device->redist_vcpu = NULL;
+
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
+ len, &io_device->dev);
+ mutex_unlock(&kvm->slots_lock);
+
+ return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
new file mode 100644
index 000000000000..fefcca2b14dc
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-mmio.h
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+#ifndef __KVM_ARM_VGIC_MMIO_H__
+#define __KVM_ARM_VGIC_MMIO_H__
+
+struct vgic_register_region {
+ unsigned int reg_offset;
+ unsigned int len;
+ unsigned int bits_per_irq;
+ unsigned int access_flags;
+ union {
+ unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len);
+ unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len);
+ };
+ union {
+ void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val);
+ void (*its_write)(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+ };
+ unsigned long (*uaccess_read)(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len);
+ union {
+ int (*uaccess_write)(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val);
+ int (*uaccess_its_write)(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+ };
+};
+
+extern struct kvm_io_device_ops kvm_io_gic_ops;
+
+#define VGIC_ACCESS_8bit 1
+#define VGIC_ACCESS_32bit 2
+#define VGIC_ACCESS_64bit 4
+
+/*
+ * Generate a mask that covers the number of bytes required to address
+ * up to 1024 interrupts, each represented by <bits> bits. This assumes
+ * that <bits> is a power of two.
+ */
+#define VGIC_ADDR_IRQ_MASK(bits) (((bits) * 1024 / 8) - 1)
+
+/*
+ * (addr & mask) gives us the _byte_ offset for the INT ID.
+ * We multiply this by 8 to get the _bit_ offset, then divide this by
+ * the number of bits to learn the actual INT ID.
+ * But instead of a division (which requires a "long long div" implementation),
+ * we shift by the binary logarithm of <bits>.
+ * This assumes that <bits> is a power of two.
+ */
+#define VGIC_ADDR_TO_INTID(addr, bits) (((addr) & VGIC_ADDR_IRQ_MASK(bits)) * \
+ 8 >> ilog2(bits))
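+
+/*
+ * Worked example of the arithmetic above (illustrative values only):
+ * for a 1-bit-per-IRQ register such as a set-enable register, the mask
+ * is (1 * 1024 / 8) - 1 = 0x7f, so a byte offset of 0x4 gives
+ * (0x4 & 0x7f) * 8 >> ilog2(1) = 32, i.e. the access starts at INTID 32.
+ * For an 8-bit-per-IRQ priority register, a byte offset of 0x8 gives
+ * (0x8 & 0x3ff) * 8 >> ilog2(8) = 8, i.e. INTID 8.
+ */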
+
+/*
+ * Some VGIC registers store per-IRQ information, with a different number
+ * of bits per IRQ. For those registers this macro is used.
+ * The _WITH_LENGTH version instantiates registers with a fixed length
+ * and is mutually exclusive with the _PER_IRQ version.
+ */
+#define REGISTER_DESC_WITH_BITS_PER_IRQ(off, rd, wr, ur, uw, bpi, acc) \
+ { \
+ .reg_offset = off, \
+ .bits_per_irq = bpi, \
+ .len = bpi * 1024 / 8, \
+ .access_flags = acc, \
+ .read = rd, \
+ .write = wr, \
+ .uaccess_read = ur, \
+ .uaccess_write = uw, \
+ }
+
+#define REGISTER_DESC_WITH_LENGTH(off, rd, wr, length, acc) \
+ { \
+ .reg_offset = off, \
+ .bits_per_irq = 0, \
+ .len = length, \
+ .access_flags = acc, \
+ .read = rd, \
+ .write = wr, \
+ }
+
+#define REGISTER_DESC_WITH_LENGTH_UACCESS(off, rd, wr, urd, uwr, length, acc) \
+ { \
+ .reg_offset = off, \
+ .bits_per_irq = 0, \
+ .len = length, \
+ .access_flags = acc, \
+ .read = rd, \
+ .write = wr, \
+ .uaccess_read = urd, \
+ .uaccess_write = uwr, \
+ }
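+
+/*
+ * As a (hypothetical) illustration of how these descriptors are meant to
+ * be instantiated, a GICv2 distributor set-enable entry could look like:
+ *
+ *   REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ENABLE_SET,
+ *           vgic_mmio_read_enable, vgic_mmio_write_senable,
+ *           NULL, vgic_uaccess_write_senable, 1, VGIC_ACCESS_32bit)
+ *
+ * i.e. one bit per IRQ, 32-bit accesses only, a dedicated userspace write
+ * handler, and the normal read handler reused for userspace reads.
+ */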
+
+unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
+
+void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
+ unsigned long data);
+
+unsigned long extract_bytes(u64 data, unsigned int offset,
+ unsigned int num);
+
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+ unsigned long val);
+
+unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
+unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val);
+
+int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val);
+
+unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len);
+
+void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val);
+
+unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
+void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+
+int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
+ bool is_write, int offset, u32 *val);
+
+u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid);
+
+void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
+ const u64 val);
+
+unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+
+unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
+
+u64 vgic_sanitise_outer_cacheability(u64 reg);
+u64 vgic_sanitise_inner_cacheability(u64 reg);
+u64 vgic_sanitise_shareability(u64 reg);
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+ u64 (*sanitise_fn)(u64));
+
+/* Find the proper register handler entry given a certain address offset */
+const struct vgic_register_region *
+vgic_find_mmio_region(const struct vgic_register_region *regions,
+ int nr_regions, unsigned int offset);
+
+#endif
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
new file mode 100644
index 000000000000..ebf53a4e1296
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/irqchip/arm-gic.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+
+static inline void vgic_v2_write_lr(int lr, u32 val)
+{
+ void __iomem *base = kvm_vgic_global_state.vctrl_base;
+
+ writel_relaxed(val, base + GICH_LR0 + (lr * 4));
+}
+
+void vgic_v2_init_lrs(void)
+{
+ int i;
+
+ for (i = 0; i < kvm_vgic_global_state.nr_lr; i++)
+ vgic_v2_write_lr(i, 0);
+}
+
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
+
+ cpuif->vgic_hcr |= GICH_HCR_UIE;
+}
+
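+/*
+ * An LR with no state left but with the EOI bit set (and no HW link)
+ * indicates that the guest has EOI'ed the interrupt and that a
+ * maintenance interrupt was requested for it.
+ */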
+static bool lr_signals_eoi_mi(u32 lr_val)
+{
+ return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) &&
+ !(lr_val & GICH_LR_HW);
+}
+
+/*
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ * - transferred as is in case of edge sensitive IRQs
+ * - set to the line-level (resample time) for level sensitive IRQs
+ */
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+ int lr;
+
+ DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+ cpuif->vgic_hcr &= ~GICH_HCR_UIE;
+
+ for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) {
+ u32 val = cpuif->vgic_lr[lr];
+ u32 cpuid, intid = val & GICH_LR_VIRTUALID;
+ struct vgic_irq *irq;
+
+ /* Extract the source vCPU id from the LR */
+ cpuid = val & GICH_LR_PHYSID_CPUID;
+ cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+ cpuid &= 7;
+
+ /* Notify fds when the guest EOI'ed a level-triggered SPI */
+ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+ kvm_notify_acked_irq(vcpu->kvm, 0,
+ intid - VGIC_NR_PRIVATE_IRQS);
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+
+ raw_spin_lock(&irq->irq_lock);
+
+ /* Always preserve the active bit */
+ irq->active = !!(val & GICH_LR_ACTIVE_BIT);
+
+ if (irq->active && vgic_irq_is_sgi(intid))
+ irq->active_source = cpuid;
+
+ /* Edge is the only case where we preserve the pending bit */
+ if (irq->config == VGIC_CONFIG_EDGE &&
+ (val & GICH_LR_PENDING_BIT)) {
+ irq->pending_latch = true;
+
+ if (vgic_irq_is_sgi(intid))
+ irq->source |= (1 << cpuid);
+ }
+
+ /*
+ * Clear soft pending state when level irqs have been acked.
+ */
+ if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE))
+ irq->pending_latch = false;
+
+ /*
+ * Level-triggered mapped IRQs are special because we only
+ * observe rising edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample
+ * the physical line and set the line level, because the
+ * device state could have changed or we simply need to
+ * process the still pending interrupt later.
+ *
+ * If this causes us to lower the level, we have to also clear
+ * the physical active state, since we will otherwise never be
+ * told when the interrupt becomes asserted again.
+ */
+ if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+
+ if (!irq->line_level)
+ vgic_irq_set_phys_active(irq, false);
+ }
+
+ raw_spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ cpuif->used_lrs = 0;
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ * it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+ u32 val = irq->intid;
+ bool allow_pending = true;
+
+ if (irq->active) {
+ val |= GICH_LR_ACTIVE_BIT;
+ if (vgic_irq_is_sgi(irq->intid))
+ val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
+ if (vgic_irq_is_multi_sgi(irq)) {
+ allow_pending = false;
+ val |= GICH_LR_EOI;
+ }
+ }
+
+ if (irq->group)
+ val |= GICH_LR_GROUP1;
+
+ if (irq->hw) {
+ val |= GICH_LR_HW;
+ val |= irq->hwintid << GICH_LR_PHYSID_CPUID_SHIFT;
+ /*
+ * Never set pending+active on a HW interrupt, as the
+ * pending state is kept at the physical distributor
+ * level.
+ */
+ if (irq->active)
+ allow_pending = false;
+ } else {
+ if (irq->config == VGIC_CONFIG_LEVEL) {
+ val |= GICH_LR_EOI;
+
+ /*
+ * Software resampling doesn't work very well
+ * if we allow P+A, so let's not do that.
+ */
+ if (irq->active)
+ allow_pending = false;
+ }
+ }
+
+ if (allow_pending && irq_is_pending(irq)) {
+ val |= GICH_LR_PENDING_BIT;
+
+ if (irq->config == VGIC_CONFIG_EDGE)
+ irq->pending_latch = false;
+
+ if (vgic_irq_is_sgi(irq->intid)) {
+ u32 src = ffs(irq->source);
+
+ if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+ irq->intid))
+ return;
+
+ val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+ irq->source &= ~(1 << (src - 1));
+ if (irq->source) {
+ irq->pending_latch = true;
+ val |= GICH_LR_EOI;
+ }
+ }
+ }
+
+ /*
+ * Level-triggered mapped IRQs are special because we only observe
+ * rising edges as input to the VGIC. We therefore lower the line
+ * level here, so that we can take new virtual IRQs. See
+ * vgic_v2_fold_lr_state for more info.
+ */
+ if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT))
+ irq->line_level = false;
+
+ /* The GICv2 LR only holds five bits of priority. */
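+ /* e.g. an 8-bit priority of 0xa0 is programmed as 0x14 (illustrative). */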
+ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+ vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+}
+
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+ vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = 0;
+}
+
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+ u32 vmcr;
+
+ vmcr = (vmcrp->grpen0 << GICH_VMCR_ENABLE_GRP0_SHIFT) &
+ GICH_VMCR_ENABLE_GRP0_MASK;
+ vmcr |= (vmcrp->grpen1 << GICH_VMCR_ENABLE_GRP1_SHIFT) &
+ GICH_VMCR_ENABLE_GRP1_MASK;
+ vmcr |= (vmcrp->ackctl << GICH_VMCR_ACK_CTL_SHIFT) &
+ GICH_VMCR_ACK_CTL_MASK;
+ vmcr |= (vmcrp->fiqen << GICH_VMCR_FIQ_EN_SHIFT) &
+ GICH_VMCR_FIQ_EN_MASK;
+ vmcr |= (vmcrp->cbpr << GICH_VMCR_CBPR_SHIFT) &
+ GICH_VMCR_CBPR_MASK;
+ vmcr |= (vmcrp->eoim << GICH_VMCR_EOI_MODE_SHIFT) &
+ GICH_VMCR_EOI_MODE_MASK;
+ vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) &
+ GICH_VMCR_ALIAS_BINPOINT_MASK;
+ vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) &
+ GICH_VMCR_BINPOINT_MASK;
+ vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) <<
+ GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
+
+ cpu_if->vgic_vmcr = vmcr;
+}
+
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+ u32 vmcr;
+
+ vmcr = cpu_if->vgic_vmcr;
+
+ vmcrp->grpen0 = (vmcr & GICH_VMCR_ENABLE_GRP0_MASK) >>
+ GICH_VMCR_ENABLE_GRP0_SHIFT;
+ vmcrp->grpen1 = (vmcr & GICH_VMCR_ENABLE_GRP1_MASK) >>
+ GICH_VMCR_ENABLE_GRP1_SHIFT;
+ vmcrp->ackctl = (vmcr & GICH_VMCR_ACK_CTL_MASK) >>
+ GICH_VMCR_ACK_CTL_SHIFT;
+ vmcrp->fiqen = (vmcr & GICH_VMCR_FIQ_EN_MASK) >>
+ GICH_VMCR_FIQ_EN_SHIFT;
+ vmcrp->cbpr = (vmcr & GICH_VMCR_CBPR_MASK) >>
+ GICH_VMCR_CBPR_SHIFT;
+ vmcrp->eoim = (vmcr & GICH_VMCR_EOI_MODE_MASK) >>
+ GICH_VMCR_EOI_MODE_SHIFT;
+
+ vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >>
+ GICH_VMCR_ALIAS_BINPOINT_SHIFT;
+ vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >>
+ GICH_VMCR_BINPOINT_SHIFT;
+ vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >>
+ GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
+}
+
+void vgic_v2_enable(struct kvm_vcpu *vcpu)
+{
+ /*
+ * By forcing VMCR to zero, the GIC will restore the binary
+ * points to their reset values. Anything else resets to zero
+ * anyway.
+ */
+ vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
+
+ /* Get the show on the road... */
+ vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
+}
+
+/* check for overlapping regions and for regions crossing the end of memory */
+static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
+{
+ if (dist_base + KVM_VGIC_V2_DIST_SIZE < dist_base)
+ return false;
+ if (cpu_base + KVM_VGIC_V2_CPU_SIZE < cpu_base)
+ return false;
+
+ if (dist_base + KVM_VGIC_V2_DIST_SIZE <= cpu_base)
+ return true;
+ if (cpu_base + KVM_VGIC_V2_CPU_SIZE <= dist_base)
+ return true;
+
+ return false;
+}
+
+int vgic_v2_map_resources(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ int ret = 0;
+
+ if (vgic_ready(kvm))
+ goto out;
+
+ if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
+ IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
+ kvm_err("Need to set vgic cpu and dist addresses first\n");
+ ret = -ENXIO;
+ goto out;
+ }
+
+ if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) {
+ kvm_err("VGIC CPU and dist frames overlap\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Initialize the vgic if this hasn't already been done on demand by
+ * accessing the vgic state from userspace.
+ */
+ ret = vgic_init(kvm);
+ if (ret) {
+ kvm_err("Unable to initialize VGIC dynamic data structures\n");
+ goto out;
+ }
+
+ ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2);
+ if (ret) {
+ kvm_err("Unable to register VGIC MMIO regions\n");
+ goto out;
+ }
+
+ if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
+ ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
+ kvm_vgic_global_state.vcpu_base,
+ KVM_VGIC_V2_CPU_SIZE, true);
+ if (ret) {
+ kvm_err("Unable to remap VGIC CPU to VCPU\n");
+ goto out;
+ }
+ }
+
+ dist->ready = true;
+
+out:
+ return ret;
+}
+
+DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap);
+
+/**
+ * vgic_v2_probe - probe for a VGICv2 compatible interrupt controller
+ * @info: pointer to the GIC description
+ *
+ * Returns 0 if the VGICv2 has been probed successfully, returns an error code
+ * otherwise
+ */
+int vgic_v2_probe(const struct gic_kvm_info *info)
+{
+ int ret;
+ u32 vtr;
+
+ if (!info->vctrl.start) {
+ kvm_err("GICH not present in the firmware table\n");
+ return -ENXIO;
+ }
+
+ if (!PAGE_ALIGNED(info->vcpu.start) ||
+ !PAGE_ALIGNED(resource_size(&info->vcpu))) {
+ kvm_info("GICV region size/alignment is unsafe, using trapping (reduced performance)\n");
+
+ ret = create_hyp_io_mappings(info->vcpu.start,
+ resource_size(&info->vcpu),
+ &kvm_vgic_global_state.vcpu_base_va,
+ &kvm_vgic_global_state.vcpu_hyp_va);
+ if (ret) {
+ kvm_err("Cannot map GICV into hyp\n");
+ goto out;
+ }
+
+ static_branch_enable(&vgic_v2_cpuif_trap);
+ }
+
+ ret = create_hyp_io_mappings(info->vctrl.start,
+ resource_size(&info->vctrl),
+ &kvm_vgic_global_state.vctrl_base,
+ &kvm_vgic_global_state.vctrl_hyp);
+ if (ret) {
+ kvm_err("Cannot map VCTRL into hyp\n");
+ goto out;
+ }
+
+ vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
+ kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
+
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ if (ret) {
+ kvm_err("Cannot register GICv2 KVM device\n");
+ goto out;
+ }
+
+ kvm_vgic_global_state.can_emulate_gicv2 = true;
+ kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+ kvm_vgic_global_state.type = VGIC_V2;
+ kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
+
+ kvm_debug("vgic-v2@%llx\n", info->vctrl.start);
+
+ return 0;
+out:
+ if (kvm_vgic_global_state.vctrl_base)
+ iounmap(kvm_vgic_global_state.vctrl_base);
+ if (kvm_vgic_global_state.vcpu_base_va)
+ iounmap(kvm_vgic_global_state.vcpu_base_va);
+
+ return ret;
+}
+
+static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+ u64 used_lrs = cpu_if->used_lrs;
+ u64 elrsr;
+ int i;
+
+ elrsr = readl_relaxed(base + GICH_ELRSR0);
+ if (unlikely(used_lrs > 32))
+ elrsr |= ((u64)readl_relaxed(base + GICH_ELRSR1)) << 32;
+
+ for (i = 0; i < used_lrs; i++) {
+ if (elrsr & (1UL << i))
+ cpu_if->vgic_lr[i] &= ~GICH_LR_STATE;
+ else
+ cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
+
+ writel_relaxed(0, base + GICH_LR0 + (i * 4));
+ }
+}
+
+void vgic_v2_save_state(struct kvm_vcpu *vcpu)
+{
+ void __iomem *base = kvm_vgic_global_state.vctrl_base;
+ u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
+
+ if (!base)
+ return;
+
+ if (used_lrs) {
+ save_lrs(vcpu, base);
+ writel_relaxed(0, base + GICH_HCR);
+ }
+}
+
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+ void __iomem *base = kvm_vgic_global_state.vctrl_base;
+ u64 used_lrs = cpu_if->used_lrs;
+ int i;
+
+ if (!base)
+ return;
+
+ if (used_lrs) {
+ writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+ for (i = 0; i < used_lrs; i++) {
+ writel_relaxed(cpu_if->vgic_lr[i],
+ base + GICH_LR0 + (i * 4));
+ }
+ }
+}
+
+void vgic_v2_load(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+ writel_relaxed(cpu_if->vgic_vmcr,
+ kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+ writel_relaxed(cpu_if->vgic_apr,
+ kvm_vgic_global_state.vctrl_base + GICH_APR);
+}
+
+void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+ cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+}
+
+void vgic_v2_put(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
+
+ vgic_v2_vmcr_sync(vcpu);
+ cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
new file mode 100644
index 000000000000..76e2d85789ed
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -0,0 +1,693 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <kvm/arm_vgic.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_asm.h>
+
+#include "vgic.h"
+
+static bool group0_trap;
+static bool group1_trap;
+static bool common_trap;
+static bool gicv4_enable;
+
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ cpuif->vgic_hcr |= ICH_HCR_UIE;
+}
+
+static bool lr_signals_eoi_mi(u64 lr_val)
+{
+ return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) &&
+ !(lr_val & ICH_LR_HW);
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ int lr;
+
+ DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+ cpuif->vgic_hcr &= ~ICH_HCR_UIE;
+
+ for (lr = 0; lr < cpuif->used_lrs; lr++) {
+ u64 val = cpuif->vgic_lr[lr];
+ u32 intid, cpuid;
+ struct vgic_irq *irq;
+ bool is_v2_sgi = false;
+
+ cpuid = val & GICH_LR_PHYSID_CPUID;
+ cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+
+ if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ intid = val & ICH_LR_VIRTUAL_ID_MASK;
+ } else {
+ intid = val & GICH_LR_VIRTUALID;
+ is_v2_sgi = vgic_irq_is_sgi(intid);
+ }
+
+ /* Notify fds when the guest EOI'ed a level-triggered IRQ */
+ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+ kvm_notify_acked_irq(vcpu->kvm, 0,
+ intid - VGIC_NR_PRIVATE_IRQS);
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ if (!irq) /* An LPI could have been unmapped. */
+ continue;
+
+ raw_spin_lock(&irq->irq_lock);
+
+ /* Always preserve the active bit */
+ irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+ if (irq->active && is_v2_sgi)
+ irq->active_source = cpuid;
+
+ /* Edge is the only case where we preserve the pending bit */
+ if (irq->config == VGIC_CONFIG_EDGE &&
+ (val & ICH_LR_PENDING_BIT)) {
+ irq->pending_latch = true;
+
+ if (is_v2_sgi)
+ irq->source |= (1 << cpuid);
+ }
+
+ /*
+ * Clear soft pending state when level irqs have been acked.
+ */
+ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
+ irq->pending_latch = false;
+
+ /*
+ * Level-triggered mapped IRQs are special because we only
+ * observe rising edges as input to the VGIC.
+ *
+ * If the guest never acked the interrupt we have to sample
+ * the physical line and set the line level, because the
+ * device state could have changed or we simply need to
+ * process the still pending interrupt later.
+ *
+ * If this causes us to lower the level, we have to also clear
+ * the physical active state, since we will otherwise never be
+ * told when the interrupt becomes asserted again.
+ */
+ if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) {
+ irq->line_level = vgic_get_phys_line_level(irq);
+
+ if (!irq->line_level)
+ vgic_irq_set_phys_active(irq, false);
+ }
+
+ raw_spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ cpuif->used_lrs = 0;
+}
+
+/* Requires the irq to be locked already */
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u64 val = irq->intid;
+ bool allow_pending = true, is_v2_sgi;
+
+ is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
+ model == KVM_DEV_TYPE_ARM_VGIC_V2);
+
+ if (irq->active) {
+ val |= ICH_LR_ACTIVE_BIT;
+ if (is_v2_sgi)
+ val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;
+ if (vgic_irq_is_multi_sgi(irq)) {
+ allow_pending = false;
+ val |= ICH_LR_EOI;
+ }
+ }
+
+ if (irq->hw) {
+ val |= ICH_LR_HW;
+ val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;
+ /*
+ * Never set pending+active on a HW interrupt, as the
+ * pending state is kept at the physical distributor
+ * level.
+ */
+ if (irq->active)
+ allow_pending = false;
+ } else {
+ if (irq->config == VGIC_CONFIG_LEVEL) {
+ val |= ICH_LR_EOI;
+
+ /*
+ * Software resampling doesn't work very well
+ * if we allow P+A, so let's not do that.
+ */
+ if (irq->active)
+ allow_pending = false;
+ }
+ }
+
+ if (allow_pending && irq_is_pending(irq)) {
+ val |= ICH_LR_PENDING_BIT;
+
+ if (irq->config == VGIC_CONFIG_EDGE)
+ irq->pending_latch = false;
+
+ if (vgic_irq_is_sgi(irq->intid) &&
+ model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+ u32 src = ffs(irq->source);
+
+ if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+ irq->intid))
+ return;
+
+ val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+ irq->source &= ~(1 << (src - 1));
+ if (irq->source) {
+ irq->pending_latch = true;
+ val |= ICH_LR_EOI;
+ }
+ }
+ }
+
+ /*
+ * Level-triggered mapped IRQs are special because we only observe
+ * rising edges as input to the VGIC. We therefore lower the line
+ * level here, so that we can take new virtual IRQs. See
+ * vgic_v3_fold_lr_state for more info.
+ */
+ if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
+ irq->line_level = false;
+
+ if (irq->group)
+ val |= ICH_LR_GROUP;
+
+ val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+ vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+}
+
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+ vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
+}
+
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+ struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u32 vmcr;
+
+ if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+ vmcr = (vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) &
+ ICH_VMCR_ACK_CTL_MASK;
+ vmcr |= (vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) &
+ ICH_VMCR_FIQ_EN_MASK;
+ } else {
+ /*
+ * When emulating GICv3 on GICv3 with SRE=1, the VFIQEn
+ * bit is RES1 and the VAckCtl bit is RES0.
+ */
+ vmcr = ICH_VMCR_FIQ_EN_MASK;
+ }
+
+ vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
+ vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
+ vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
+ vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
+ vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
+ vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
+ vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;
+
+ cpu_if->vgic_vmcr = vmcr;
+}
+
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
+{
+ struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u32 vmcr;
+
+ vmcr = cpu_if->vgic_vmcr;
+
+ if (model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+ vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >>
+ ICH_VMCR_ACK_CTL_SHIFT;
+ vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >>
+ ICH_VMCR_FIQ_EN_SHIFT;
+ } else {
+ /*
+ * When emulating GICv3 on GICv3 with SRE=1, the VFIQEn
+ * bit is RES1 and the VAckCtl bit is RES0.
+ */
+ vmcrp->fiqen = 1;
+ vmcrp->ackctl = 0;
+ }
+
+ vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
+ vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
+ vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
+ vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
+ vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
+ vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
+ vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
+}
+
+#define INITIAL_PENDBASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
+
+void vgic_v3_enable(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ /*
+ * By forcing VMCR to zero, the GIC will restore the binary
+ * points to their reset values. Anything else resets to zero
+ * anyway.
+ */
+ vgic_v3->vgic_vmcr = 0;
+
+ /*
+ * If we are emulating a GICv3, we do it in a non-GICv2-compatible
+ * way, so we force SRE to 1 to demonstrate this to the guest.
+ * Also, we don't support any form of IRQ/FIQ bypass.
+ * This goes with the spec allowing the value to be RAO/WI.
+ */
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB |
+ ICC_SRE_EL1_DFB |
+ ICC_SRE_EL1_SRE);
+ vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
+ } else {
+ vgic_v3->vgic_sre = 0;
+ }
+
+ vcpu->arch.vgic_cpu.num_id_bits = (kvm_vgic_global_state.ich_vtr_el2 &
+ ICH_VTR_ID_BITS_MASK) >>
+ ICH_VTR_ID_BITS_SHIFT;
+ vcpu->arch.vgic_cpu.num_pri_bits = ((kvm_vgic_global_state.ich_vtr_el2 &
+ ICH_VTR_PRI_BITS_MASK) >>
+ ICH_VTR_PRI_BITS_SHIFT) + 1;
+
+ /* Get the show on the road... */
+ vgic_v3->vgic_hcr = ICH_HCR_EN;
+ if (group0_trap)
+ vgic_v3->vgic_hcr |= ICH_HCR_TALL0;
+ if (group1_trap)
+ vgic_v3->vgic_hcr |= ICH_HCR_TALL1;
+ if (common_trap)
+ vgic_v3->vgic_hcr |= ICH_HCR_TC;
+}
+
+int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
+{
+ struct kvm_vcpu *vcpu;
+ int byte_offset, bit_nr;
+ gpa_t pendbase, ptr;
+ bool status;
+ u8 val;
+ int ret;
+ unsigned long flags;
+
+retry:
+ vcpu = irq->target_vcpu;
+ if (!vcpu)
+ return 0;
+
+ pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+
+ byte_offset = irq->intid / BITS_PER_BYTE;
+ bit_nr = irq->intid % BITS_PER_BYTE;
+ ptr = pendbase + byte_offset;
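+ /*
+ * e.g. for LPI 8193 this reads byte 1024 of the pending table and
+ * tests bit 1 below (illustrative values only).
+ */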
+
+ ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
+ if (ret)
+ return ret;
+
+ status = val & (1 << bit_nr);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (irq->target_vcpu != vcpu) {
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ goto retry;
+ }
+ irq->pending_latch = status;
+ vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
+
+ if (status) {
+ /* clear consumed data */
+ val &= ~(1 << bit_nr);
+ ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
+ * The kvm lock and all vcpu locks must be held.
+ */
+int vgic_v3_save_pending_tables(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ gpa_t last_ptr = ~(gpa_t)0;
+ int ret;
+ u8 val;
+
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ int byte_offset, bit_nr;
+ struct kvm_vcpu *vcpu;
+ gpa_t pendbase, ptr;
+ bool stored;
+
+ vcpu = irq->target_vcpu;
+ if (!vcpu)
+ continue;
+
+ pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+
+ byte_offset = irq->intid / BITS_PER_BYTE;
+ bit_nr = irq->intid % BITS_PER_BYTE;
+ ptr = pendbase + byte_offset;
+
+ if (ptr != last_ptr) {
+ ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
+ if (ret)
+ return ret;
+ last_ptr = ptr;
+ }
+
+ stored = val & (1U << bit_nr);
+ if (stored == irq->pending_latch)
+ continue;
+
+ if (irq->pending_latch)
+ val |= 1 << bit_nr;
+ else
+ val &= ~(1 << bit_nr);
+
+ ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * vgic_v3_rdist_overlap - check if a region overlaps with any
+ * existing redistributor region
+ *
+ * @kvm: kvm handle
+ * @base: base of the region
+ * @size: size of region
+ *
+ * Return: true if there is an overlap
+ */
+bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size)
+{
+ struct vgic_dist *d = &kvm->arch.vgic;
+ struct vgic_redist_region *rdreg;
+
+ list_for_each_entry(rdreg, &d->rd_regions, list) {
+ if ((base + size > rdreg->base) &&
+ (base < rdreg->base + vgic_v3_rd_region_size(kvm, rdreg)))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Check for overlapping regions and for regions crossing the end of memory
+ * for base addresses which have already been set.
+ */
+bool vgic_v3_check_base(struct kvm *kvm)
+{
+ struct vgic_dist *d = &kvm->arch.vgic;
+ struct vgic_redist_region *rdreg;
+
+ if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
+ d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
+ return false;
+
+ list_for_each_entry(rdreg, &d->rd_regions, list) {
+ if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) <
+ rdreg->base)
+ return false;
+ }
+
+ if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base))
+ return true;
+
+ return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base,
+ KVM_VGIC_V3_DIST_SIZE);
+}
+
+/**
+ * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one
+ * which has free space to put a new rdist region.
+ *
+ * @rd_regions: redistributor region list head
+ *
+ * A redistributor region maps n redistributors, n = region size / (2 x 64kB),
+ * e.g. a 256 kB region maps two redistributors. The stride between
+ * redistributors is 0 and regions are filled in index order.
+ *
+ * Return: the redist region handle, if any, that has space to map a new rdist
+ * region.
+ */
+struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions)
+{
+ struct vgic_redist_region *rdreg;
+
+ list_for_each_entry(rdreg, rd_regions, list) {
+ if (!vgic_v3_redist_region_full(rdreg))
+ return rdreg;
+ }
+ return NULL;
+}
+
+struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
+ u32 index)
+{
+ struct list_head *rd_regions = &kvm->arch.vgic.rd_regions;
+ struct vgic_redist_region *rdreg;
+
+ list_for_each_entry(rdreg, rd_regions, list) {
+ if (rdreg->index == index)
+ return rdreg;
+ }
+ return NULL;
+}
+
+
+int vgic_v3_map_resources(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct kvm_vcpu *vcpu;
+ int ret = 0;
+ int c;
+
+ if (vgic_ready(kvm))
+ goto out;
+
+ kvm_for_each_vcpu(c, vcpu, kvm) {
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) {
+ kvm_debug("vcpu %d redistributor base not set\n", c);
+ ret = -ENXIO;
+ goto out;
+ }
+ }
+
+ if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) {
+ kvm_err("Need to set vgic distributor addresses first\n");
+ ret = -ENXIO;
+ goto out;
+ }
+
+ if (!vgic_v3_check_base(kvm)) {
+ kvm_err("VGIC redist and dist frames overlap\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * For a VGICv3 we require the userland to explicitly initialize
+ * the VGIC before we need to use it.
+ */
+ if (!vgic_initialized(kvm)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3);
+ if (ret) {
+ kvm_err("Unable to register VGICv3 dist MMIO regions\n");
+ goto out;
+ }
+
+ if (kvm_vgic_global_state.has_gicv4_1)
+ vgic_v4_configure_vsgis(kvm);
+ dist->ready = true;
+
+out:
+ return ret;
+}
+
+DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
+
+static int __init early_group0_trap_cfg(char *buf)
+{
+ return strtobool(buf, &group0_trap);
+}
+early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
+
+static int __init early_group1_trap_cfg(char *buf)
+{
+ return strtobool(buf, &group1_trap);
+}
+early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
+
+static int __init early_common_trap_cfg(char *buf)
+{
+ return strtobool(buf, &common_trap);
+}
+early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
+
+static int __init early_gicv4_enable(char *buf)
+{
+ return strtobool(buf, &gicv4_enable);
+}
+early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
+
+/**
+ * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
+ * @info: pointer to the GIC description
+ *
+ * Returns 0 if the VGICv3 has been probed successfully, or an error code
+ * otherwise.
+ */
+int vgic_v3_probe(const struct gic_kvm_info *info)
+{
+ u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
+ int ret;
+
+ /*
+ * The ListRegs field is 5 bits, but there is an architectural
+ * maximum of 16 list registers. Just ignore bit 4...
+ */
+ kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
+ kvm_vgic_global_state.can_emulate_gicv2 = false;
+ kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2;
+
+ /* GICv4 support? */
+ if (info->has_v4) {
+ kvm_vgic_global_state.has_gicv4 = gicv4_enable;
+ kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable;
+ kvm_info("GICv4%s support %sabled\n",
+ kvm_vgic_global_state.has_gicv4_1 ? ".1" : "",
+ gicv4_enable ? "en" : "dis");
+ }
+
+ if (!info->vcpu.start) {
+ kvm_info("GICv3: no GICV resource entry\n");
+ kvm_vgic_global_state.vcpu_base = 0;
+ } else if (!PAGE_ALIGNED(info->vcpu.start)) {
+ pr_warn("GICV physical address 0x%llx not page aligned\n",
+ (unsigned long long)info->vcpu.start);
+ kvm_vgic_global_state.vcpu_base = 0;
+ } else {
+ kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+ kvm_vgic_global_state.can_emulate_gicv2 = true;
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ if (ret) {
+ kvm_err("Cannot register GICv2 KVM device.\n");
+ return ret;
+ }
+ kvm_info("vgic-v2@%llx\n", info->vcpu.start);
+ }
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (ret) {
+ kvm_err("Cannot register GICv3 KVM device.\n");
+ kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
+ return ret;
+ }
+
+ if (kvm_vgic_global_state.vcpu_base == 0)
+ kvm_info("disabling GICv2 emulation\n");
+
+ if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
+ group0_trap = true;
+ group1_trap = true;
+ }
+
+ if (group0_trap || group1_trap || common_trap) {
+ kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n",
+ group0_trap ? "G0" : "",
+ group1_trap ? "G1" : "",
+ common_trap ? "C" : "");
+ static_branch_enable(&vgic_v3_cpuif_trap);
+ }
+
+ kvm_vgic_global_state.vctrl_base = NULL;
+ kvm_vgic_global_state.type = VGIC_V3;
+ kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;
+
+ return 0;
+}
+
+void vgic_v3_load(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ /*
+ * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
+ * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
+ * VMCR_EL2 save/restore in the world switch.
+ */
+ if (likely(cpu_if->vgic_sre))
+ kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
+
+ kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
+
+ if (has_vhe())
+ __vgic_v3_activate_traps(cpu_if);
+
+ WARN_ON(vgic_v4_load(vcpu));
+}
+
+void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ if (likely(cpu_if->vgic_sre))
+ cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
+}
+
+void vgic_v3_put(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ WARN_ON(vgic_v4_put(vcpu, false));
+
+ vgic_v3_vmcr_sync(vcpu);
+
+ kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
+
+ if (has_vhe())
+ __vgic_v3_deactivate_traps(cpu_if);
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
new file mode 100644
index 000000000000..27ac833e5ec7
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kvm_host.h>
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include "vgic.h"
+
+/*
+ * How KVM uses GICv4 (insert rude comments here):
+ *
+ * The vgic-v4 layer acts as a bridge between several entities:
+ * - The GICv4 ITS representation offered by the ITS driver
+ * - VFIO, which is in charge of the PCI endpoint
+ * - The virtual ITS, which is the only thing the guest sees
+ *
+ * The configuration of VLPIs is triggered by a callback from VFIO,
+ * instructing KVM that a PCI device has been configured to deliver
+ * MSIs to a vITS.
+ *
+ * kvm_vgic_v4_set_forwarding() is thus called with the routing entry,
+ * and this is used to find the corresponding vITS data structures
+ * (ITS instance, device, event and irq) using a process that is
+ * extremely similar to the injection of an MSI.
+ *
+ * At this stage, we can link the guest's view of an LPI (uniquely
+ * identified by the routing entry) and the host irq, using the GICv4
+ * driver mapping operation. Should the mapping succeed, we've then
+ * successfully upgraded the guest's LPI to a VLPI. We can then start
+ * with updating GICv4's view of the property table and generating an
+ * INValidation in order to kickstart the delivery of this VLPI to the
+ * guest directly, without software intervention. Well, almost.
+ *
+ * When the PCI endpoint is deconfigured, this operation is reversed
+ * with VFIO calling kvm_vgic_v4_unset_forwarding().
+ *
+ * Once the VLPI has been mapped, it needs to follow any change the
+ * guest performs on its LPI through the vITS. For that, a number of
+ * command handlers have hooks to communicate these changes to the HW:
+ * - Any invalidation triggers a call to its_prop_update_vlpi()
+ * - The INT command results in an irq_set_irqchip_state(), which
+ * generates an INT on the corresponding VLPI.
+ * - The CLEAR command results in an irq_set_irqchip_state(), which
+ * generates a CLEAR on the corresponding VLPI.
+ * - DISCARD translates into an unmap, similar to a call to
+ * kvm_vgic_v4_unset_forwarding().
+ * - MOVI is translated by an update of the existing mapping, changing
+ * the target vcpu, resulting in a VMOVI being generated.
+ * - MOVALL is translated by a string of mapping updates (similar to
+ * the handling of MOVI). MOVALL is horrible.
+ *
+ * Note that a DISCARD/MAPTI sequence emitted from the guest without
+ * reprogramming the PCI endpoint after MAPTI does not result in a
+ * VLPI being mapped, as there is no callback from VFIO (the guest
+ * will get the interrupt via the normal SW injection). Fixing this is
+ * not trivial, and requires some horrible messing with the VFIO
+ * internals. Not fun. Don't do that.
+ *
+ * Then there is the scheduling. Each time a vcpu is about to run on a
+ * physical CPU, KVM must tell the corresponding redistributor about
+ * it. And if we've migrated our vcpu from one CPU to another, we must
+ * tell the ITS (so that the messages reach the right redistributor).
+ * This is done in two steps: first issue an irq_set_affinity() on the
+ * irq corresponding to the vcpu, then call its_make_vpe_resident().
+ * You must be in a non-preemptible context. On exit, a call to
+ * its_make_vpe_non_resident() tells the redistributor that we're done
+ * with the vcpu.
+ *
+ * Finally, the doorbell handling: Each vcpu is allocated an interrupt
+ * which will fire each time a VLPI is made pending whilst the vcpu is
+ * not running. Each time the vcpu gets blocked, the doorbell
+ * interrupt gets enabled. When the vcpu is unblocked (for whatever
+ * reason), the doorbell interrupt is disabled.
+ */
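As a rough sketch of the forwarding lifecycle described above, a hypothetical bypass-side caller could drive the two entry points implemented later in this file as follows (the helper name and the omitted error handling are illustrative only):

	/* Illustrative: set up direct injection, later tear it down again. */
	static int vgic_example_forward(struct kvm *kvm, int host_irq,
					struct kvm_kernel_irq_routing_entry *e)
	{
		int ret = kvm_vgic_v4_set_forwarding(kvm, host_irq, e);

		if (ret)
			return ret;

		/* ... the device's MSI is now delivered to the guest as a VLPI ... */

		return kvm_vgic_v4_unset_forwarding(kvm, host_irq, e);
	}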
+
+#define DB_IRQ_FLAGS (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING)
+
+static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info)
+{
+ struct kvm_vcpu *vcpu = info;
+
+ /* We got the message, no need to fire again */
+ if (!kvm_vgic_global_state.has_gicv4_1 &&
+ !irqd_irq_disabled(&irq_to_desc(irq)->irq_data))
+ disable_irq_nosync(irq);
+
+ vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true;
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return IRQ_HANDLED;
+}
+
+static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq)
+{
+ vpe->sgi_config[irq->intid].enabled = irq->enabled;
+ vpe->sgi_config[irq->intid].group = irq->group;
+ vpe->sgi_config[irq->intid].priority = irq->priority;
+}
+
+static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu)
+{
+ struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+ int i;
+
+ /*
+ * With GICv4.1, every virtual SGI can be directly injected. So
+ * let's pretend that they are HW interrupts, tied to a host
+ * IRQ. The SGI code will do its magic.
+ */
+ for (i = 0; i < VGIC_NR_SGIS; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+ struct irq_desc *desc;
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ if (irq->hw)
+ goto unlock;
+
+ irq->hw = true;
+ irq->host_irq = irq_find_mapping(vpe->sgi_domain, i);
+
+ /* Transfer the full irq state to the vPE */
+ vgic_v4_sync_sgi_config(vpe, irq);
+ desc = irq_to_desc(irq->host_irq);
+ ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc),
+ false);
+ if (!WARN_ON(ret)) {
+ /* Transfer pending state */
+ ret = irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ irq->pending_latch);
+ WARN_ON(ret);
+ irq->pending_latch = false;
+ }
+ unlock:
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ for (i = 0; i < VGIC_NR_SGIS; i++) {
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, i);
+ struct irq_desc *desc;
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ if (!irq->hw)
+ goto unlock;
+
+ irq->hw = false;
+ ret = irq_get_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ &irq->pending_latch);
+ WARN_ON(ret);
+
+ desc = irq_to_desc(irq->host_irq);
+ irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
+ unlock:
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+}
+
+/* Must be called with the kvm lock held */
+void vgic_v4_configure_vsgis(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_arm_halt_guest(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (dist->nassgireq)
+ vgic_v4_enable_vsgis(vcpu);
+ else
+ vgic_v4_disable_vsgis(vcpu);
+ }
+
+ kvm_arm_resume_guest(kvm);
+}
+
+/**
+ * vgic_v4_init - Initialize the GICv4 data structures
+ * @kvm: Pointer to the VM being initialized
+ *
+ * We may be called each time a vITS is created, or when the
+ * vgic is initialized. This relies on kvm->lock to be
+ * held. In both cases, the number of vcpus should now be
+ * fixed.
+ */
+int vgic_v4_init(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct kvm_vcpu *vcpu;
+ int i, nr_vcpus, ret;
+
+ if (!kvm_vgic_global_state.has_gicv4)
+ return 0; /* Nothing to see here... move along. */
+
+ if (dist->its_vm.vpes)
+ return 0;
+
+ nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+ dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes),
+ GFP_KERNEL);
+ if (!dist->its_vm.vpes)
+ return -ENOMEM;
+
+ dist->its_vm.nr_vpes = nr_vcpus;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+ ret = its_alloc_vcpu_irqs(&dist->its_vm);
+ if (ret < 0) {
+ kvm_err("VPE IRQ allocation failure\n");
+ kfree(dist->its_vm.vpes);
+ dist->its_vm.nr_vpes = 0;
+ dist->its_vm.vpes = NULL;
+ return ret;
+ }
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ int irq = dist->its_vm.vpes[i]->irq;
+ unsigned long irq_flags = DB_IRQ_FLAGS;
+
+ /*
+ * Don't automatically enable the doorbell, as we're
+ * flipping it back and forth when the vcpu gets
+ * blocked. Also disable the lazy disabling, as the
+ * doorbell could kick us out of the guest too
+ * early...
+ *
+ * On GICv4.1, the doorbell is managed in HW and must
+ * be left enabled.
+ */
+ if (kvm_vgic_global_state.has_gicv4_1)
+ irq_flags &= ~IRQ_NOAUTOEN;
+ irq_set_status_flags(irq, irq_flags);
+
+ ret = request_irq(irq, vgic_v4_doorbell_handler,
+ 0, "vcpu", vcpu);
+ if (ret) {
+ kvm_err("failed to allocate vcpu IRQ%d\n", irq);
+ /*
+ * Trick: adjust the number of vpes so we know
+ * how many to nuke on teardown...
+ */
+ dist->its_vm.nr_vpes = i;
+ break;
+ }
+ }
+
+ if (ret)
+ vgic_v4_teardown(kvm);
+
+ return ret;
+}
+
+/**
+ * vgic_v4_teardown - Free the GICv4 data structures
+ * @kvm: Pointer to the VM being destroyed
+ *
+ * Relies on kvm->lock to be held.
+ */
+void vgic_v4_teardown(struct kvm *kvm)
+{
+ struct its_vm *its_vm = &kvm->arch.vgic.its_vm;
+ int i;
+
+ if (!its_vm->vpes)
+ return;
+
+ for (i = 0; i < its_vm->nr_vpes; i++) {
+ struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i);
+ int irq = its_vm->vpes[i]->irq;
+
+ irq_clear_status_flags(irq, DB_IRQ_FLAGS);
+ free_irq(irq, vcpu);
+ }
+
+ its_free_vcpu_irqs(its_vm);
+ kfree(its_vm->vpes);
+ its_vm->nr_vpes = 0;
+ its_vm->vpes = NULL;
+}
+
+int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db)
+{
+ struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+ if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident)
+ return 0;
+
+ return its_make_vpe_non_resident(vpe, need_db);
+}
+
+int vgic_v4_load(struct kvm_vcpu *vcpu)
+{
+ struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+ int err;
+
+ if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident)
+ return 0;
+
+ /*
+ * Before making the VPE resident, make sure the redistributor
+ * corresponding to our current CPU expects us here. See the
+ * doc in drivers/irqchip/irq-gic-v4.c to understand how this
+ * turns into a VMOVP command at the ITS level.
+ */
+ err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id()));
+ if (err)
+ return err;
+
+ err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled);
+ if (err)
+ return err;
+
+ /*
+ * Now that the VPE is resident, let's get rid of a potential
+ * doorbell interrupt that would still be pending. This is a
+ * GICv4.0 only "feature"...
+ */
+ if (!kvm_vgic_global_state.has_gicv4_1)
+ err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false);
+
+ return err;
+}
+
+static struct vgic_its *vgic_get_its(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+ struct kvm_msi msi = (struct kvm_msi) {
+ .address_lo = irq_entry->msi.address_lo,
+ .address_hi = irq_entry->msi.address_hi,
+ .data = irq_entry->msi.data,
+ .flags = irq_entry->msi.flags,
+ .devid = irq_entry->msi.devid,
+ };
+
+ return vgic_msi_to_its(kvm, &msi);
+}
+
+int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
+ struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+ struct vgic_its *its;
+ struct vgic_irq *irq;
+ struct its_vlpi_map map;
+ int ret;
+
+ if (!vgic_supports_direct_msis(kvm))
+ return 0;
+
+ /*
+ * Get the ITS, and escape early on error (not a valid
+ * doorbell for any of our vITSs).
+ */
+ its = vgic_get_its(kvm, irq_entry);
+ if (IS_ERR(its))
+ return 0;
+
+ mutex_lock(&its->its_lock);
+
+ /* Perform the actual DevID/EventID -> LPI translation. */
+ ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+ irq_entry->msi.data, &irq);
+ if (ret)
+ goto out;
+
+ /*
+ * Emit the mapping request. If it fails, the ITS probably
+ * isn't v4 compatible, so let's silently bail out. Holding
+ * the ITS lock should ensure that nothing can modify the
+ * target vcpu.
+ */
+ map = (struct its_vlpi_map) {
+ .vm = &kvm->arch.vgic.its_vm,
+ .vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe,
+ .vintid = irq->intid,
+ .properties = ((irq->priority & 0xfc) |
+ (irq->enabled ? LPI_PROP_ENABLED : 0) |
+ LPI_PROP_GROUP1),
+ .db_enabled = true,
+ };
+
+ ret = its_map_vlpi(virq, &map);
+ if (ret)
+ goto out;
+
+ irq->hw = true;
+ irq->host_irq = virq;
+ atomic_inc(&map.vpe->vlpi_count);
+
+out:
+ mutex_unlock(&its->its_lock);
+ return ret;
+}
+
+int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
+ struct kvm_kernel_irq_routing_entry *irq_entry)
+{
+ struct vgic_its *its;
+ struct vgic_irq *irq;
+ int ret;
+
+ if (!vgic_supports_direct_msis(kvm))
+ return 0;
+
+ /*
+ * Get the ITS, and escape early on error (not a valid
+ * doorbell for any of our vITSs).
+ */
+ its = vgic_get_its(kvm, irq_entry);
+ if (IS_ERR(its))
+ return 0;
+
+ mutex_lock(&its->its_lock);
+
+ ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
+ irq_entry->msi.data, &irq);
+ if (ret)
+ goto out;
+
+ WARN_ON(!(irq->hw && irq->host_irq == virq));
+ if (irq->hw) {
+ atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
+ irq->hw = false;
+ ret = its_unmap_vlpi(virq);
+ }
+
+out:
+ mutex_unlock(&its->its_lock);
+ return ret;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
new file mode 100644
index 000000000000..c3643b7f101b
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -0,0 +1,1020 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/list_sort.h>
+#include <linux/nospec.h>
+
+#include <asm/kvm_hyp.h>
+
+#include "vgic.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+struct vgic_global kvm_vgic_global_state __ro_after_init = {
+ .gicv3_cpuif = STATIC_KEY_FALSE_INIT,
+};
+
+/*
+ * Locking order is always:
+ * kvm->lock (mutex)
+ * its->cmd_lock (mutex)
+ * its->its_lock (mutex)
+ * vgic_cpu->ap_list_lock must be taken with IRQs disabled
+ * kvm->lpi_list_lock must be taken with IRQs disabled
+ * vgic_irq->irq_lock must be taken with IRQs disabled
+ *
+ * As the ap_list_lock might be taken from the timer interrupt handler,
+ * we have to disable IRQs before taking this lock and everything lower
+ * than it.
+ *
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower-ranking lock first and re-acquire it after having
+ * taken the upper one.
+ *
+ * When taking more than one ap_list_lock at the same time, always take the
+ * lowest numbered VCPU's ap_list_lock first, so:
+ * vcpuX->vcpu_id < vcpuY->vcpu_id:
+ * raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
+ * raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
+ *
+ * Since the VGIC must support injecting virtual interrupts from ISRs, we have
+ * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer
+ * spinlocks for any lock that may be taken while injecting an interrupt.
+ */
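A minimal sketch of that ordering, assuming a hypothetical helper that needs both an ITS mutex and a per-IRQ lock (the helper and the work it does are placeholders):

	/* Illustrative: take the upper (mutex) lock first, then the IRQ-safe lock. */
	static void vgic_example_touch_lpi(struct vgic_its *its, struct vgic_irq *irq)
	{
		unsigned long flags;

		mutex_lock(&its->its_lock);
		raw_spin_lock_irqsave(&irq->irq_lock, flags);
		/* ... modify the per-IRQ state ... */
		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
		mutex_unlock(&its->its_lock);
	}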
+
+/*
+ * Iterate over the VM's list of mapped LPIs to find the one with a
+ * matching interrupt ID and return a reference to the IRQ structure.
+ */
+static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq = NULL;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ if (irq->intid != intid)
+ continue;
+
+ /*
+ * This increases the refcount; the caller is expected to
+ * call vgic_put_irq() later once it's finished with the IRQ.
+ */
+ vgic_get_irq_kref(irq);
+ goto out_unlock;
+ }
+ irq = NULL;
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+ return irq;
+}
+
+/*
+ * This looks up the virtual interrupt ID to get the corresponding
+ * struct vgic_irq. It also increases the refcount, so any caller is expected
+ * to call vgic_put_irq() once it's finished with this IRQ.
+ */
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ u32 intid)
+{
+ /* SGIs and PPIs */
+ if (intid <= VGIC_MAX_PRIVATE) {
+ intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
+ return &vcpu->arch.vgic_cpu.private_irqs[intid];
+ }
+
+ /* SPIs */
+ if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+ intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
+ return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
+ }
+
+ /* LPIs */
+ if (intid >= VGIC_MIN_LPI)
+ return vgic_get_lpi(kvm, intid);
+
+ WARN(1, "Looking up struct vgic_irq for reserved INTID");
+ return NULL;
+}
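A short, hypothetical caller illustrating the get/put contract stated above (the helper name and the work done under the lock are placeholders):

	/* Illustrative: balance every successful lookup with a put. */
	static void vgic_example_poke(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid)
	{
		struct vgic_irq *irq = vgic_get_irq(kvm, vcpu, intid);
		unsigned long flags;

		if (!irq)
			return;

		raw_spin_lock_irqsave(&irq->irq_lock, flags);
		/* ... inspect or update the IRQ under its lock ... */
		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

		vgic_put_irq(kvm, irq);
	}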
+
+/*
+ * We can't do anything in here, because we lack the kvm pointer to
+ * lock and remove the item from the lpi_list. So we keep this function
+ * empty and use the return value of kref_put() to trigger the freeing.
+ */
+static void vgic_irq_release(struct kref *ref)
+{
+}
+
+/*
+ * Drop the refcount on the LPI. Must be called with lpi_list_lock held.
+ */
+void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+
+ if (!kref_put(&irq->refcount, vgic_irq_release))
+ return;
+
+ list_del(&irq->lpi_list);
+ dist->lpi_list_count--;
+
+ kfree(irq);
+}
+
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long flags;
+
+ if (irq->intid < VGIC_MIN_LPI)
+ return;
+
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ __vgic_put_lpi_locked(kvm, irq);
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+}
+
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_irq *irq, *tmp;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+ list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+ if (irq->intid >= VGIC_MIN_LPI) {
+ raw_spin_lock(&irq->irq_lock);
+ list_del(&irq->ap_list);
+ irq->vcpu = NULL;
+ raw_spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+}
+
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
+{
+ WARN_ON(irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ pending));
+}
+
+bool vgic_get_phys_line_level(struct vgic_irq *irq)
+{
+ bool line_level;
+
+ BUG_ON(!irq->hw);
+
+ if (irq->get_input_level)
+ return irq->get_input_level(irq->intid);
+
+ WARN_ON(irq_get_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_PENDING,
+ &line_level));
+ return line_level;
+}
+
+/* Set/Clear the physical active state */
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
+{
+ BUG_ON(!irq->hw);
+ WARN_ON(irq_set_irqchip_state(irq->host_irq,
+ IRQCHIP_STATE_ACTIVE,
+ active));
+}
+
+/**
+ * kvm_vgic_target_oracle - compute the target vcpu for an irq
+ *
+ * @irq: The irq to route. Must be already locked.
+ *
+ * Based on the current state of the interrupt (enabled, pending,
+ * active, vcpu and target_vcpu), compute the next vcpu this should be
+ * given to. Return NULL if this shouldn't be injected at all.
+ *
+ * Requires the IRQ lock to be held.
+ */
+static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
+{
+ lockdep_assert_held(&irq->irq_lock);
+
+ /* If the interrupt is active, it must stay on the current vcpu */
+ if (irq->active)
+ return irq->vcpu ? : irq->target_vcpu;
+
+ /*
+ * If the IRQ is not active but enabled and pending, we should direct
+ * it to its configured target VCPU.
+ * If the distributor is disabled, pending interrupts shouldn't be
+ * forwarded.
+ */
+ if (irq->enabled && irq_is_pending(irq)) {
+ if (unlikely(irq->target_vcpu &&
+ !irq->target_vcpu->kvm->arch.vgic.enabled))
+ return NULL;
+
+ return irq->target_vcpu;
+ }
+
+ /*
+ * If neither active nor pending and enabled, then this IRQ should not
+ * be queued to any VCPU.
+ */
+ return NULL;
+}
+
+/*
+ * The order of items in the ap_lists defines how we'll pack things in LRs as
+ * well, the first items in the list being the first things populated in the
+ * LRs.
+ *
+ * A hard rule is that active interrupts can never be pushed out of the LRs
+ * (and therefore take priority) since we cannot reliably trap on deactivation
+ * of IRQs and therefore they have to be present in the LRs.
+ *
+ * Otherwise things should be sorted by the priority field and the GIC
+ * hardware support will take care of preemption of priority groups etc.
+ *
+ * Return negative if "a" sorts before "b", 0 to preserve order, and positive
+ * to sort "b" before "a".
+ */
+static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
+ struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
+ bool penda, pendb;
+ int ret;
+
+ /*
+ * list_sort may call this function with the same element when
+ * the list is fairly long.
+ */
+ if (unlikely(irqa == irqb))
+ return 0;
+
+ raw_spin_lock(&irqa->irq_lock);
+ raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
+
+ if (irqa->active || irqb->active) {
+ ret = (int)irqb->active - (int)irqa->active;
+ goto out;
+ }
+
+ penda = irqa->enabled && irq_is_pending(irqa);
+ pendb = irqb->enabled && irq_is_pending(irqb);
+
+ if (!penda || !pendb) {
+ ret = (int)pendb - (int)penda;
+ goto out;
+ }
+
+ /* Both pending and enabled, sort by priority */
+ ret = irqa->priority - irqb->priority;
+out:
+ raw_spin_unlock(&irqb->irq_lock);
+ raw_spin_unlock(&irqa->irq_lock);
+ return ret;
+}
+
+/* Must be called with the ap_list_lock held */
+static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+ list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
+}
+
+/*
+ * An injection is only valid if it changes the level of a level-triggered
+ * IRQ or is a rising edge on an edge-triggered IRQ; in-kernel connected
+ * IRQ lines can only be controlled by their owner.
+ */
+static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
+{
+ if (irq->owner != owner)
+ return false;
+
+ switch (irq->config) {
+ case VGIC_CONFIG_LEVEL:
+ return irq->line_level != level;
+ case VGIC_CONFIG_EDGE:
+ return level;
+ }
+
+ return false;
+}
+
+/*
+ * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
+ * Do the queuing if necessary, taking the right locks in the right order.
+ * Returns true when the IRQ was queued, false otherwise.
+ *
+ * Needs to be entered with the IRQ lock already held, but will return
+ * with all locks dropped.
+ */
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+ unsigned long flags)
+{
+ struct kvm_vcpu *vcpu;
+
+ lockdep_assert_held(&irq->irq_lock);
+
+retry:
+ vcpu = vgic_target_oracle(irq);
+ if (irq->vcpu || !vcpu) {
+ /*
+ * If this IRQ is already on a VCPU's ap_list, then it
+ * cannot be moved or modified and there is no more work for
+ * us to do.
+ *
+ * Otherwise, if the irq is not pending and enabled, it does
+ * not need to be inserted into an ap_list and there is also
+ * no more work for us to do.
+ */
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ /*
+ * We have to kick the VCPU here, because we could be
+ * queueing an edge-triggered interrupt for which we
+ * get no EOI maintenance interrupt. In that case,
+ * while the IRQ is already on the VCPU's AP list, the
+ * VCPU could have EOI'ed the original interrupt and
+ * won't see this one until it exits for some other
+ * reason.
+ */
+ if (vcpu) {
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+ return false;
+ }
+
+ /*
+ * We must unlock the irq lock to take the ap_list_lock where
+ * we are going to insert this new pending interrupt.
+ */
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ /* someone can do stuff here, which we re-check below */
+
+ raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+ raw_spin_lock(&irq->irq_lock);
+
+ /*
+ * Did something change behind our backs?
+ *
+ * There are two cases:
+ * 1) The irq lost its pending state or was disabled behind our
+ * backs and/or it was queued to another VCPU's ap_list.
+ * 2) Someone changed the affinity on this irq behind our
+ * backs and we are now holding the wrong ap_list_lock.
+ *
+ * In both cases, drop the locks and retry.
+ */
+
+ if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
+ raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
+ flags);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ goto retry;
+ }
+
+ /*
+ * Grab a reference to the irq to reflect the fact that it is
+ * now in the ap_list.
+ */
+ vgic_get_irq_kref(irq);
+ list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
+ irq->vcpu = vcpu;
+
+ raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return true;
+}
+
+/**
+ * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
+ * @kvm: The VM structure pointer
+ * @cpuid: The CPU for PPIs
+ * @intid: The INTID to inject a new state to.
+ * @level: Edge-triggered: true: to trigger the interrupt
+ * false: to ignore the call
+ * Level-sensitive: true: raise the input signal
+ * false: lower the input signal
+ * @owner: The opaque pointer to the owner of the IRQ being raised to verify
+ * that the caller is allowed to inject this IRQ. Userspace
+ * injections will have owner == NULL.
+ *
+ * The VGIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts. You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+ bool level, void *owner)
+{
+ struct kvm_vcpu *vcpu;
+ struct vgic_irq *irq;
+ unsigned long flags;
+ int ret;
+
+ trace_vgic_update_irq_pending(cpuid, intid, level);
+
+ ret = vgic_lazy_init(kvm);
+ if (ret)
+ return ret;
+
+ vcpu = kvm_get_vcpu(kvm, cpuid);
+ if (!vcpu && intid < VGIC_NR_PRIVATE_IRQS)
+ return -EINVAL;
+
+ irq = vgic_get_irq(kvm, vcpu, intid);
+ if (!irq)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+
+ if (!vgic_validate_injection(irq, level, owner)) {
+ /* Nothing to see here, move along... */
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(kvm, irq);
+ return 0;
+ }
+
+ if (irq->config == VGIC_CONFIG_LEVEL)
+ irq->line_level = level;
+ else
+ irq->pending_latch = true;
+
+ vgic_queue_irq_unlock(kvm, irq, flags);
+ vgic_put_irq(kvm, irq);
+
+ return 0;
+}
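A hypothetical in-kernel user of this API, assuming the guest has configured INTID 40 as edge-triggered and INTID 41 as level-sensitive (both INTIDs are arbitrary examples):

	/* Illustrative: INTIDs 40 (edge) and 41 (level) are made-up SPIs. */
	static int vgic_example_inject(struct kvm *kvm, void *owner)
	{
		int ret;

		/* Edge-triggered: only level == true triggers, false is ignored. */
		ret = kvm_vgic_inject_irq(kvm, 0, 40, true, owner);
		if (ret)
			return ret;

		/* Level-sensitive: raise the line now... */
		ret = kvm_vgic_inject_irq(kvm, 0, 41, true, owner);
		if (ret)
			return ret;

		/* ...and lower it again later. */
		return kvm_vgic_inject_irq(kvm, 0, 41, false, owner);
	}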
+
+/* @irq->irq_lock must be held */
+static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
+ unsigned int host_irq,
+ bool (*get_input_level)(int vintid))
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+
+ /*
+ * Find the physical IRQ number corresponding to @host_irq
+ */
+ desc = irq_to_desc(host_irq);
+ if (!desc) {
+ kvm_err("%s: no interrupt descriptor\n", __func__);
+ return -EINVAL;
+ }
+ data = irq_desc_get_irq_data(desc);
+ while (data->parent_data)
+ data = data->parent_data;
+
+ irq->hw = true;
+ irq->host_irq = host_irq;
+ irq->hwintid = data->hwirq;
+ irq->get_input_level = get_input_level;
+ return 0;
+}
+
+/* @irq->irq_lock must be held */
+static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
+{
+ irq->hw = false;
+ irq->hwintid = 0;
+ irq->get_input_level = NULL;
+}
+
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
+ u32 vintid, bool (*get_input_level)(int vintid))
+{
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ unsigned long flags;
+ int ret;
+
+ BUG_ON(!irq);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+
+ return ret;
+}
+
+/**
+ * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ
+ * @vcpu: The VCPU pointer
+ * @vintid: The INTID of the interrupt
+ *
+ * Reset the active and pending states of a mapped interrupt. Kernel
+ * subsystems injecting mapped interrupts should reset their interrupt lines
+ * when we are doing a reset of the VM.
+ */
+void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
+{
+ struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ unsigned long flags;
+
+ if (!irq->hw)
+ goto out;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ irq->active = false;
+ irq->pending_latch = false;
+ irq->line_level = false;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+out:
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+ struct vgic_irq *irq;
+ unsigned long flags;
+
+ if (!vgic_initialized(vcpu->kvm))
+ return -EAGAIN;
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ BUG_ON(!irq);
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ kvm_vgic_unmap_irq(irq);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+
+ return 0;
+}
+
+/**
+ * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
+ *
+ * @vcpu: Pointer to the VCPU (used for PPIs)
+ * @intid: The virtual INTID identifying the interrupt (PPI or SPI)
+ * @owner: Opaque pointer to the owner
+ *
+ * Returns 0 if intid is not already used by another in-kernel device and the
+ * owner is set, otherwise returns an error code.
+ */
+int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
+{
+ struct vgic_irq *irq;
+ unsigned long flags;
+ int ret = 0;
+
+ if (!vgic_initialized(vcpu->kvm))
+ return -EAGAIN;
+
+ /* SGIs and LPIs cannot be wired up to any device */
+ if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
+ return -EINVAL;
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ if (irq->owner && irq->owner != owner)
+ ret = -EEXIST;
+ else
+ irq->owner = owner;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ return ret;
+}
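A sketch of how an in-kernel device might pair ownership with injection, assuming a valid PPI number and an initialized vgic (the helper is illustrative only):

	/* Illustrative: claim the PPI once, then inject with the same owner cookie. */
	static int vgic_example_claim_and_raise(struct kvm_vcpu *vcpu,
						unsigned int ppi, void *owner)
	{
		int ret = kvm_vgic_set_owner(vcpu, ppi, owner);

		if (ret)
			return ret;

		return kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, ppi, true, owner);
	}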
+
+/**
+ * vgic_prune_ap_list - Remove non-relevant interrupts from the list
+ *
+ * @vcpu: The VCPU pointer
+ *
+ * Go over the list of "interesting" interrupts, and prune those that we
+ * won't have to consider in the near future.
+ */
+static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_irq *irq, *tmp;
+
+ DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+retry:
+ raw_spin_lock(&vgic_cpu->ap_list_lock);
+
+ list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+ struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
+ bool target_vcpu_needs_kick = false;
+
+ raw_spin_lock(&irq->irq_lock);
+
+ BUG_ON(vcpu != irq->vcpu);
+
+ target_vcpu = vgic_target_oracle(irq);
+
+ if (!target_vcpu) {
+ /*
+ * We don't need to process this interrupt any
+ * further, move it off the list.
+ */
+ list_del(&irq->ap_list);
+ irq->vcpu = NULL;
+ raw_spin_unlock(&irq->irq_lock);
+
+ /*
+ * This vgic_put_irq call matches the
+ * vgic_get_irq_kref in vgic_queue_irq_unlock,
+ * where we added the LPI to the ap_list. As
+ * we remove the irq from the list, we also
+ * drop the refcount.
+ */
+ vgic_put_irq(vcpu->kvm, irq);
+ continue;
+ }
+
+ if (target_vcpu == vcpu) {
+ /* We're on the right CPU */
+ raw_spin_unlock(&irq->irq_lock);
+ continue;
+ }
+
+ /* This interrupt looks like it has to be migrated. */
+
+ raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&vgic_cpu->ap_list_lock);
+
+ /*
+ * Ensure locking order by always locking the smallest
+ * ID first.
+ */
+ if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
+ vcpuA = vcpu;
+ vcpuB = target_vcpu;
+ } else {
+ vcpuA = target_vcpu;
+ vcpuB = vcpu;
+ }
+
+ raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+ raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
+ SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&irq->irq_lock);
+
+ /*
+ * If the affinity has been preserved, move the
+ * interrupt around. Otherwise, it means things have
+ * changed while the interrupt was unlocked, and we
+ * need to replay this.
+ *
+ * In all cases, we cannot trust the list not to have
+ * changed, so we restart from the beginning.
+ */
+ if (target_vcpu == vgic_target_oracle(irq)) {
+ struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;
+
+ list_del(&irq->ap_list);
+ irq->vcpu = target_vcpu;
+ list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
+ target_vcpu_needs_kick = true;
+ }
+
+ raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+ raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+
+ if (target_vcpu_needs_kick) {
+ kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
+ kvm_vcpu_kick(target_vcpu);
+ }
+
+ goto retry;
+ }
+
+ raw_spin_unlock(&vgic_cpu->ap_list_lock);
+}
+
+static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_fold_lr_state(vcpu);
+ else
+ vgic_v3_fold_lr_state(vcpu);
+}
+
+/* Requires the irq_lock to be held. */
+static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
+ struct vgic_irq *irq, int lr)
+{
+ lockdep_assert_held(&irq->irq_lock);
+
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_populate_lr(vcpu, irq, lr);
+ else
+ vgic_v3_populate_lr(vcpu, irq, lr);
+}
+
+static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_clear_lr(vcpu, lr);
+ else
+ vgic_v3_clear_lr(vcpu, lr);
+}
+
+static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_set_underflow(vcpu);
+ else
+ vgic_v3_set_underflow(vcpu);
+}
+
+/* Requires the ap_list_lock to be held. */
+static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
+ bool *multi_sgi)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_irq *irq;
+ int count = 0;
+
+ *multi_sgi = false;
+
+ lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ int w;
+
+ raw_spin_lock(&irq->irq_lock);
+ /* GICv2 SGIs can count for more than one... */
+ w = vgic_irq_get_lr_count(irq);
+ raw_spin_unlock(&irq->irq_lock);
+
+ count += w;
+ *multi_sgi |= (w > 1);
+ }
+ return count;
+}
+
+/* Requires the VCPU's ap_list_lock to be held. */
+static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_irq *irq;
+ int count;
+ bool multi_sgi;
+ u8 prio = 0xff;
+ int i = 0;
+
+ lockdep_assert_held(&vgic_cpu->ap_list_lock);
+
+ count = compute_ap_list_depth(vcpu, &multi_sgi);
+ if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
+ vgic_sort_ap_list(vcpu);
+
+ count = 0;
+
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ raw_spin_lock(&irq->irq_lock);
+
+ /*
+ * If we have multi-SGIs in the pipeline, we need to
+ * guarantee that they are all seen before any IRQ of
+ * lower priority. In that case, we need to filter out
+ * these interrupts by exiting early. This is easy as
+ * the AP list has been sorted already.
+ */
+ if (multi_sgi && irq->priority > prio) {
+ raw_spin_unlock(&irq->irq_lock);
+ break;
+ }
+
+ if (likely(vgic_target_oracle(irq) == vcpu)) {
+ vgic_populate_lr(vcpu, irq, count++);
+
+ if (irq->source)
+ prio = irq->priority;
+ }
+
+ raw_spin_unlock(&irq->irq_lock);
+
+ if (count == kvm_vgic_global_state.nr_lr) {
+ if (!list_is_last(&irq->ap_list,
+ &vgic_cpu->ap_list_head))
+ vgic_set_underflow(vcpu);
+ break;
+ }
+ }
+
+ /* Nuke remaining LRs */
+ for (i = count ; i < kvm_vgic_global_state.nr_lr; i++)
+ vgic_clear_lr(vcpu, i);
+
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count;
+ else
+ vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count;
+}
+
+static inline bool can_access_vgic_from_kernel(void)
+{
+ /*
+ * GICv2 can always be accessed from the kernel because it is
+ * memory-mapped, and VHE systems can access GICv3 EL2 system
+ * registers.
+ */
+ return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
+}
+
+static inline void vgic_save_state(struct kvm_vcpu *vcpu)
+{
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vgic_v2_save_state(vcpu);
+ else
+ __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
+}
+
+/* Sync back the hardware VGIC state into our emulation after a guest's run. */
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
+{
+ int used_lrs;
+
+ /* An empty ap_list_head implies used_lrs == 0 */
+ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+ return;
+
+ if (can_access_vgic_from_kernel())
+ vgic_save_state(vcpu);
+
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
+ else
+ used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
+
+ if (used_lrs)
+ vgic_fold_lr_state(vcpu);
+ vgic_prune_ap_list(vcpu);
+}
+
+static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
+{
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ vgic_v2_restore_state(vcpu);
+ else
+ __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
+}
+
+/* Flush our emulation state into the GIC hardware before entering the guest. */
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If there are no virtual interrupts active or pending for this
+ * VCPU, then there is no work to do and we can bail out without
+ * taking any lock. There is a potential race with someone injecting
+ * interrupts to the VCPU, but it is a benign race as the VCPU will
+ * either observe the new interrupt before or after doing this check,
+ * and introducing an additional synchronization mechanism doesn't change
+ * this.
+ *
+ * Note that we still need to go through the whole thing if anything
+ * can be directly injected (GICv4).
+ */
+ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+ !vgic_supports_direct_msis(vcpu->kvm))
+ return;
+
+ DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+ if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+ raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ vgic_flush_lr_state(vcpu);
+ raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ }
+
+ if (can_access_vgic_from_kernel())
+ vgic_restore_state(vcpu);
+}
+
+void kvm_vgic_load(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!vgic_initialized(vcpu->kvm)))
+ return;
+
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_load(vcpu);
+ else
+ vgic_v3_load(vcpu);
+}
+
+void kvm_vgic_put(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!vgic_initialized(vcpu->kvm)))
+ return;
+
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_put(vcpu);
+ else
+ vgic_v3_put(vcpu);
+}
+
+void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
+ return;
+
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_vmcr_sync(vcpu);
+ else
+ vgic_v3_vmcr_sync(vcpu);
+}
+
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_irq *irq;
+ bool pending = false;
+ unsigned long flags;
+ struct vgic_vmcr vmcr;
+
+ if (!vcpu->kvm->arch.vgic.enabled)
+ return false;
+
+ if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
+ return true;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+
+ raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ raw_spin_lock(&irq->irq_lock);
+ pending = irq_is_pending(irq) && irq->enabled &&
+ !irq->active &&
+ irq->priority < vmcr.pmr;
+ raw_spin_unlock(&irq->irq_lock);
+
+ if (pending)
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+
+ return pending;
+}
+
+void vgic_kick_vcpus(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int c;
+
+ /*
+ * We've injected an interrupt, time to find out who deserves
+ * a good kick...
+ */
+ kvm_for_each_vcpu(c, vcpu, kvm) {
+ if (kvm_vgic_vcpu_pending_irq(vcpu)) {
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+ }
+}
+
+bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
+{
+ struct vgic_irq *irq;
+ bool map_is_active;
+ unsigned long flags;
+
+ if (!vgic_initialized(vcpu->kvm))
+ return false;
+
+ irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ map_is_active = irq->hw && irq->active;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(vcpu->kvm, irq);
+
+ return map_is_active;
+}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
new file mode 100644
index 000000000000..64fcd7511110
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2015, 2016 ARM Ltd.
+ */
+#ifndef __KVM_ARM_VGIC_NEW_H__
+#define __KVM_ARM_VGIC_NEW_H__
+
+#include <linux/irqchip/arm-gic-common.h>
+
+#define PRODUCT_ID_KVM 0x4b /* ASCII code K */
+#define IMPLEMENTER_ARM 0x43b
+
+#define VGIC_ADDR_UNDEF (-1)
+#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
+
+#define INTERRUPT_ID_BITS_SPIS 10
+#define INTERRUPT_ID_BITS_ITS 16
+#define VGIC_PRI_BITS 5
+
+#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
+
+#define VGIC_AFFINITY_0_SHIFT 0
+#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
+#define VGIC_AFFINITY_1_SHIFT 8
+#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
+#define VGIC_AFFINITY_2_SHIFT 16
+#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
+#define VGIC_AFFINITY_3_SHIFT 24
+#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)
+
+#define VGIC_AFFINITY_LEVEL(reg, level) \
+ ((((reg) & VGIC_AFFINITY_## level ##_MASK) \
+ >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
+
+/*
+ * Userspace encodes the affinity differently from the MPIDR; the macro
+ * below converts the vgic userspace format to the MPIDR register format.
+ */
+#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
+ VGIC_AFFINITY_LEVEL(val, 1) | \
+ VGIC_AFFINITY_LEVEL(val, 2) | \
+ VGIC_AFFINITY_LEVEL(val, 3))
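A worked example may help; it assumes the arm64 MPIDR_LEVEL_SHIFT() values of 0, 8, 16 and 32 for affinity levels 0 to 3:

	/*
	 * Illustrative only: a userspace value of 0x04030201 carries
	 * Aff3=4, Aff2=3, Aff1=2, Aff0=1, so
	 *
	 *	VGIC_TO_MPIDR(0x04030201) == 0x0000000400030201
	 *
	 * i.e. Aff3 moves from userspace bits [31:24] to MPIDR bits [39:32],
	 * while Aff2..Aff0 keep their byte positions.
	 */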
+
+/*
+ * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst,
+ * the macros below are defined for CPUREG encoding.
+ */
+#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000
+#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT 14
+#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK 0x0000000000003800
+#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT 11
+#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK 0x0000000000000780
+#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT 7
+#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK 0x0000000000000078
+#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT 3
+#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK 0x0000000000000007
+#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT 0
+
+#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
+ KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
+ KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
+ KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
+ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)
+
+/*
+ * As per Documentation/virt/kvm/devices/arm-vgic-its.rst,
+ * the macros below are defined for ITS table entry encoding.
+ */
+#define KVM_ITS_CTE_VALID_SHIFT 63
+#define KVM_ITS_CTE_VALID_MASK BIT_ULL(63)
+#define KVM_ITS_CTE_RDBASE_SHIFT 16
+#define KVM_ITS_CTE_ICID_MASK GENMASK_ULL(15, 0)
+#define KVM_ITS_ITE_NEXT_SHIFT 48
+#define KVM_ITS_ITE_PINTID_SHIFT 16
+#define KVM_ITS_ITE_PINTID_MASK GENMASK_ULL(47, 16)
+#define KVM_ITS_ITE_ICID_MASK GENMASK_ULL(15, 0)
+#define KVM_ITS_DTE_VALID_SHIFT 63
+#define KVM_ITS_DTE_VALID_MASK BIT_ULL(63)
+#define KVM_ITS_DTE_NEXT_SHIFT 49
+#define KVM_ITS_DTE_NEXT_MASK GENMASK_ULL(62, 49)
+#define KVM_ITS_DTE_ITTADDR_SHIFT 5
+#define KVM_ITS_DTE_ITTADDR_MASK GENMASK_ULL(48, 5)
+#define KVM_ITS_DTE_SIZE_MASK GENMASK_ULL(4, 0)
+#define KVM_ITS_L1E_VALID_MASK BIT_ULL(63)
+/* we only support 64 kB translation table page size */
+#define KVM_ITS_L1E_ADDR_MASK GENMASK_ULL(51, 16)
+
+#define KVM_VGIC_V3_RDIST_INDEX_MASK GENMASK_ULL(11, 0)
+#define KVM_VGIC_V3_RDIST_FLAGS_MASK GENMASK_ULL(15, 12)
+#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT 12
+#define KVM_VGIC_V3_RDIST_BASE_MASK GENMASK_ULL(51, 16)
+#define KVM_VGIC_V3_RDIST_COUNT_MASK GENMASK_ULL(63, 52)
+#define KVM_VGIC_V3_RDIST_COUNT_SHIFT 52
+
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
+#else
+#define DEBUG_SPINLOCK_BUG_ON(p)
+#endif
+
+/* Requires the irq_lock to be held by the caller. */
+static inline bool irq_is_pending(struct vgic_irq *irq)
+{
+ if (irq->config == VGIC_CONFIG_EDGE)
+ return irq->pending_latch;
+ else
+ return irq->pending_latch || irq->line_level;
+}
+
+static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
+{
+ return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
+}
+
+static inline int vgic_irq_get_lr_count(struct vgic_irq *irq)
+{
+ /* Account for the active state as an interrupt */
+ if (vgic_irq_is_sgi(irq->intid) && irq->source)
+ return hweight8(irq->source) + irq->active;
+
+ return irq_is_pending(irq) || irq->active;
+}
+
+static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
+{
+ return vgic_irq_get_lr_count(irq) > 1;
+}
+
+/*
+ * This struct provides an intermediate representation of the fields contained
+ * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
+ * state to userspace can generate either GICv2 or GICv3 CPU interface
+ * registers regardless of the hardware GIC used.
+ */
+struct vgic_vmcr {
+ u32 grpen0;
+ u32 grpen1;
+
+ u32 ackctl;
+ u32 fiqen;
+ u32 cbpr;
+ u32 eoim;
+
+ u32 abpr;
+ u32 bpr;
+ u32 pmr; /* Priority mask field in the GICC_PMR and
+ * ICC_PMR_EL1 priority field format */
+};
+
+struct vgic_reg_attr {
+ struct kvm_vcpu *vcpu;
+ gpa_t addr;
+};
+
+int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+ struct vgic_reg_attr *reg_attr);
+int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
+ struct vgic_reg_attr *reg_attr);
+const struct vgic_register_region *
+vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
+ gpa_t addr, int len);
+struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ u32 intid);
+void __vgic_put_lpi_locked(struct kvm *kvm, struct vgic_irq *irq);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+bool vgic_get_phys_line_level(struct vgic_irq *irq);
+void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
+void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
+bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+ unsigned long flags);
+void vgic_kick_vcpus(struct kvm *kvm);
+
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+ phys_addr_t addr, phys_addr_t alignment);
+
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v2_set_npie(struct kvm_vcpu *vcpu);
+int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ int offset, u32 *val);
+int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ int offset, u32 *val);
+void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v2_enable(struct kvm_vcpu *vcpu);
+int vgic_v2_probe(const struct gic_kvm_info *info);
+int vgic_v2_map_resources(struct kvm *kvm);
+int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
+ enum vgic_type);
+
+void vgic_v2_init_lrs(void);
+void vgic_v2_load(struct kvm_vcpu *vcpu);
+void vgic_v2_put(struct kvm_vcpu *vcpu);
+void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
+
+void vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+ if (irq->intid < VGIC_MIN_LPI)
+ return;
+
+ kref_get(&irq->refcount);
+}
+
+void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
+void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_set_npie(struct kvm_vcpu *vcpu);
+void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_v3_enable(struct kvm_vcpu *vcpu);
+int vgic_v3_probe(const struct gic_kvm_info *info);
+int vgic_v3_map_resources(struct kvm *kvm);
+int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
+int vgic_v3_save_pending_tables(struct kvm *kvm);
+int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
+int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
+bool vgic_v3_check_base(struct kvm *kvm);
+
+void vgic_v3_load(struct kvm_vcpu *vcpu);
+void vgic_v3_put(struct kvm_vcpu *vcpu);
+void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
+
+bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
+int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ int offset, u32 *val);
+int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ int offset, u32 *val);
+int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ u64 id, u64 *val);
+int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
+ u64 *reg);
+int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
+ u32 intid, u64 *val);
+int kvm_register_vgic_device(unsigned long type);
+void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
+int vgic_lazy_init(struct kvm *kvm);
+int vgic_init(struct kvm *kvm);
+
+void vgic_debug_init(struct kvm *kvm);
+void vgic_debug_destroy(struct kvm *kvm);
+
+bool lock_all_vcpus(struct kvm *kvm);
+void unlock_all_vcpus(struct kvm *kvm);
+
+static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
+
+ /*
+ * num_pri_bits is initialized with the HW-supported value. We can
+ * safely rely on num_pri_bits even if the VM has not restored
+ * ICC_CTLR_EL1 before restoring the APnR registers.
+ */
+ switch (cpu_if->num_pri_bits) {
+ case 7: return 3;
+ case 6: return 1;
+ default: return 0;
+ }
+}
+
+static inline bool
+vgic_v3_redist_region_full(struct vgic_redist_region *region)
+{
+ if (!region->count)
+ return false;
+
+ return (region->free_index >= region->count);
+}
+
+struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs);
+
+static inline size_t
+vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
+{
+ if (!rdreg->count)
+ return atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE;
+ else
+ return rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
+}
+
+struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
+ u32 index);
+
+bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);
+
+static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
+{
+ struct vgic_dist *d = &kvm->arch.vgic;
+
+ return (base + size > d->vgic_dist_base) &&
+ (base < d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE);
+}
+
+int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
+int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
+ u32 devid, u32 eventid, struct vgic_irq **irq);
+struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
+int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
+void vgic_lpi_translation_cache_init(struct kvm *kvm);
+void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
+void vgic_its_invalidate_cache(struct kvm *kvm);
+
+bool vgic_supports_direct_msis(struct kvm *kvm);
+int vgic_v4_init(struct kvm *kvm);
+void vgic_v4_teardown(struct kvm *kvm);
+void vgic_v4_configure_vsgis(struct kvm *kvm);
+
+#endif
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 8e25e89ad01f..0f8a3a9e3795 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -20,36 +20,36 @@
* x0 - bytes not copied
*/
- .macro ldrb1 ptr, regB, val
- uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val
+ .macro ldrb1 reg, ptr, val
+ uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val
.endm
- .macro strb1 ptr, regB, val
- strb \ptr, [\regB], \val
+ .macro strb1 reg, ptr, val
+ strb \reg, [\ptr], \val
.endm
- .macro ldrh1 ptr, regB, val
- uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val
+ .macro ldrh1 reg, ptr, val
+ uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val
.endm
- .macro strh1 ptr, regB, val
- strh \ptr, [\regB], \val
+ .macro strh1 reg, ptr, val
+ strh \reg, [\ptr], \val
.endm
- .macro ldr1 ptr, regB, val
- uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val
+ .macro ldr1 reg, ptr, val
+ uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val
.endm
- .macro str1 ptr, regB, val
- str \ptr, [\regB], \val
+ .macro str1 reg, ptr, val
+ str \reg, [\ptr], \val
.endm
- .macro ldp1 ptr, regB, regC, val
- uao_ldp 9998f, \ptr, \regB, \regC, \val
+ .macro ldp1 reg1, reg2, ptr, val
+ uao_ldp 9998f, \reg1, \reg2, \ptr, \val
.endm
- .macro stp1 ptr, regB, regC, val
- stp \ptr, \regB, [\regC], \val
+ .macro stp1 reg1, reg2, ptr, val
+ stp \reg1, \reg2, [\ptr], \val
.endm
end .req x5
diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 667139013ed1..80e37ada0ee1 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -21,36 +21,36 @@
* Returns:
* x0 - bytes not copied
*/
- .macro ldrb1 ptr, regB, val
- uao_user_alternative 9998f, ldrb, ldtrb, \ptr, \regB, \val
+ .macro ldrb1 reg, ptr, val
+ uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val
.endm
- .macro strb1 ptr, regB, val
- uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val
+ .macro strb1 reg, ptr, val
+ uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val
.endm
- .macro ldrh1 ptr, regB, val
- uao_user_alternative 9998f, ldrh, ldtrh, \ptr, \regB, \val
+ .macro ldrh1 reg, ptr, val
+ uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val
.endm
- .macro strh1 ptr, regB, val
- uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val
+ .macro strh1 reg, ptr, val
+ uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val
.endm
- .macro ldr1 ptr, regB, val
- uao_user_alternative 9998f, ldr, ldtr, \ptr, \regB, \val
+ .macro ldr1 reg, ptr, val
+ uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val
.endm
- .macro str1 ptr, regB, val
- uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val
+ .macro str1 reg, ptr, val
+ uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val
.endm
- .macro ldp1 ptr, regB, regC, val
- uao_ldp 9998f, \ptr, \regB, \regC, \val
+ .macro ldp1 reg1, reg2, ptr, val
+ uao_ldp 9998f, \reg1, \reg2, \ptr, \val
.endm
- .macro stp1 ptr, regB, regC, val
- uao_stp 9998f, \ptr, \regB, \regC, \val
+ .macro stp1 reg1, reg2, ptr, val
+ uao_stp 9998f, \reg1, \reg2, \ptr, \val
.endm
end .req x5
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 1a104d0089f3..4ec59704b8f2 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -19,36 +19,36 @@
* Returns:
* x0 - bytes not copied
*/
- .macro ldrb1 ptr, regB, val
- ldrb \ptr, [\regB], \val
+ .macro ldrb1 reg, ptr, val
+ ldrb \reg, [\ptr], \val
.endm
- .macro strb1 ptr, regB, val
- uao_user_alternative 9998f, strb, sttrb, \ptr, \regB, \val
+ .macro strb1 reg, ptr, val
+ uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val
.endm
- .macro ldrh1 ptr, regB, val
- ldrh \ptr, [\regB], \val
+ .macro ldrh1 reg, ptr, val
+ ldrh \reg, [\ptr], \val
.endm
- .macro strh1 ptr, regB, val
- uao_user_alternative 9998f, strh, sttrh, \ptr, \regB, \val
+ .macro strh1 reg, ptr, val
+ uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val
.endm
- .macro ldr1 ptr, regB, val
- ldr \ptr, [\regB], \val
+ .macro ldr1 reg, ptr, val
+ ldr \reg, [\ptr], \val
.endm
- .macro str1 ptr, regB, val
- uao_user_alternative 9998f, str, sttr, \ptr, \regB, \val
+ .macro str1 reg, ptr, val
+ uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val
.endm
- .macro ldp1 ptr, regB, regC, val
- ldp \ptr, \regB, [\regC], \val
+ .macro ldp1 reg1, reg2, ptr, val
+ ldp \reg1, \reg2, [\ptr], \val
.endm
- .macro stp1 ptr, regB, regC, val
- uao_stp 9998f, \ptr, \regB, \regC, \val
+ .macro stp1 reg1, reg2, ptr, val
+ uao_stp 9998f, \reg1, \reg2, \ptr, \val
.endm
end .req x5
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index 243e107e9896..0f9e10ecda23 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -9,7 +9,7 @@
#include <asm/alternative.h>
#include <asm/assembler.h>
- .cpu generic+crc
+ .arch armv8-a+crc
.macro __crc32, c
cmp x2, #16
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 9f382adfa88a..e0bf83d556f2 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -24,36 +24,36 @@
* Returns:
* x0 - dest
*/
- .macro ldrb1 ptr, regB, val
- ldrb \ptr, [\regB], \val
+ .macro ldrb1 reg, ptr, val
+ ldrb \reg, [\ptr], \val
.endm
- .macro strb1 ptr, regB, val
- strb \ptr, [\regB], \val
+ .macro strb1 reg, ptr, val
+ strb \reg, [\ptr], \val
.endm
- .macro ldrh1 ptr, regB, val
- ldrh \ptr, [\regB], \val
+ .macro ldrh1 reg, ptr, val
+ ldrh \reg, [\ptr], \val
.endm
- .macro strh1 ptr, regB, val
- strh \ptr, [\regB], \val
+ .macro strh1 reg, ptr, val
+ strh \reg, [\ptr], \val
.endm
- .macro ldr1 ptr, regB, val
- ldr \ptr, [\regB], \val
+ .macro ldr1 reg, ptr, val
+ ldr \reg, [\ptr], \val
.endm
- .macro str1 ptr, regB, val
- str \ptr, [\regB], \val
+ .macro str1 reg, ptr, val
+ str \reg, [\ptr], \val
.endm
- .macro ldp1 ptr, regB, regC, val
- ldp \ptr, \regB, [\regC], \val
+ .macro ldp1 reg1, reg2, ptr, val
+ ldp \reg1, \reg2, [\ptr], \val
.endm
- .macro stp1 ptr, regB, regC, val
- stp \ptr, \regB, [\regC], \val
+ .macro stp1 reg1, reg2, ptr, val
+ stp \reg1, \reg2, [\ptr], \val
.endm
.weak memcpy
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 9b26f9a88724..d702d60e64da 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -92,6 +92,9 @@ static void set_reserved_asid_bits(void)
bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
}
+#define asid_gen_match(asid) \
+ (!(((asid) ^ atomic64_read(&asid_generation)) >> asid_bits))
+
static void flush_context(void)
{
int i;
@@ -220,8 +223,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
* because atomic RmWs are totally ordered for a given location.
*/
old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
- if (old_active_asid &&
- !((asid ^ atomic64_read(&asid_generation)) >> asid_bits) &&
+ if (old_active_asid && asid_gen_match(asid) &&
atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
old_active_asid, asid))
goto switch_mm_fastpath;
@@ -229,7 +231,7 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
raw_spin_lock_irqsave(&cpu_asid_lock, flags);
/* Check that our ASID belongs to the current generation. */
asid = atomic64_read(&mm->context.id);
- if ((asid ^ atomic64_read(&asid_generation)) >> asid_bits) {
+ if (!asid_gen_match(asid)) {
asid = new_context(mm);
atomic64_set(&mm->context.id, asid);
}
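
A minimal standalone sketch of the generation test that the new asid_gen_match() macro performs: mm->context.id packs the hardware ASID in the low asid_bits bits and a generation counter above them, so the XOR-and-shift result is zero only when the generations agree. The 16-bit ASID width and the sample values are assumptions made for this demo.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ASID_BITS 16

static bool asid_gen_match(uint64_t asid, uint64_t current_gen)
{
	/* Upper (generation) bits cancel only if the generations are equal. */
	return !((asid ^ current_gen) >> ASID_BITS);
}

int main(void)
{
	uint64_t gen = 3ULL << ASID_BITS;            /* current generation */
	uint64_t live = gen | 0x2a;                  /* ASID 0x2a, current generation */
	uint64_t stale = (2ULL << ASID_BITS) | 0x2a; /* same ASID, older generation */

	printf("live:  %d\n", asid_gen_match(live, gen));  /* 1 */
	printf("stale: %d\n", asid_gen_match(stale, gen)); /* 0 */
	return 0;
}
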
diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c
index 860c00ec8bd3..0da020c563e6 100644
--- a/arch/arm64/mm/dump.c
+++ b/arch/arm64/mm/dump.c
@@ -146,6 +146,11 @@ static const struct prot_bits pte_bits[] = {
.set = "UXN",
.clear = " ",
}, {
+ .mask = PTE_GP,
+ .val = PTE_GP,
+ .set = "GP",
+ .clear = " ",
+ }, {
.mask = PTE_ATTRINDX_MASK,
.val = PTE_ATTRINDX(MT_DEVICE_nGnRnE),
.set = "DEVICE/nGnRnE",
@@ -247,7 +252,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
}
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- unsigned long val)
+ u64 val)
{
struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
static const char units[] = "KMGTPE";
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c9cedc0432d2..df8ae73d950b 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -145,6 +145,7 @@ static void show_pte(unsigned long addr)
pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
do {
+ p4d_t *p4dp, p4d;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
pte_t *ptep, pte;
@@ -152,7 +153,13 @@ static void show_pte(unsigned long addr)
if (pgd_none(pgd) || pgd_bad(pgd))
break;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
+ pr_cont(", p4d=%016llx", p4d_val(p4d));
+ if (p4d_none(p4d) || p4d_bad(p4d))
+ break;
+
+ pudp = pud_offset(p4dp, addr);
pud = READ_ONCE(*pudp);
pr_cont(", pud=%016llx", pud_val(pud));
if (pud_none(pud) || pud_bad(pud))
@@ -635,11 +642,13 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
inf = esr_to_fault_info(esr);
- /*
- * Return value ignored as we rely on signal merging.
- * Future patches will make this more robust.
- */
- apei_claim_sea(regs);
+ if (user_mode(regs) && apei_claim_sea(regs) == 0) {
+ /*
+ * APEI claimed this as a firmware-first notification.
+ * Some processing deferred to task_work before ret_to_user().
+ */
+ return 0;
+ }
if (esr & ESR_ELx_FnV)
siaddr = NULL;
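
The p4d step added above (and in the similar hunks for hugetlbpage.c, kasan_init.c, mmu.c and pageattr.c below) makes the walks follow the generic 5-level page-table API. On arm64 the p4d level is folded, which the following standalone sketch illustrates with simplified stand-in types; it is not the kernel's definition of p4d_offset().

#include <assert.h>
#include <stdint.h>

typedef struct { uint64_t val; } pgd_t;
typedef struct { uint64_t val; } p4d_t;	/* folded: same storage as a pgd slot */

static p4d_t *p4d_offset(pgd_t *pgdp, uint64_t addr)
{
	(void)addr;		/* only one p4d per pgd when the level is folded */
	return (p4d_t *)pgdp;
}

int main(void)
{
	pgd_t pgd = { .val = 0x700000003003ULL };	/* made-up table descriptor */
	p4d_t *p4dp = p4d_offset(&pgd, 0xffff000012345000ULL);

	/* The walk sees the same entry at both levels, so the extra hop is free. */
	assert((void *)p4dp == (void *)&pgd);
	assert(p4dp->val == pgd.val);
	return 0;
}
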
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index bbeb6a5a6ba6..0a52ce46f020 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -67,11 +67,13 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, size_t *pgsize)
{
pgd_t *pgdp = pgd_offset(mm, addr);
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
*pgsize = PAGE_SIZE;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ pudp = pud_offset(p4dp, addr);
pmdp = pmd_offset(pudp, addr);
if ((pte_t *)pmdp == ptep) {
*pgsize = PMD_SIZE;
@@ -217,12 +219,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep = NULL;
pgdp = pgd_offset(mm, addr);
- pudp = pud_alloc(mm, pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ pudp = pud_alloc(mm, p4dp, addr);
if (!pudp)
return NULL;
@@ -230,6 +234,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
ptep = (pte_t *)pudp;
} else if (sz == (CONT_PTE_SIZE)) {
pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ return NULL;
WARN_ON(addr & (sz - 1));
/*
@@ -259,6 +265,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
@@ -266,7 +273,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
if (!pgd_present(READ_ONCE(*pgdp)))
return NULL;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ if (!p4d_present(READ_ONCE(*p4dp)))
+ return NULL;
+
+ pudp = pud_offset(p4dp, addr);
pud = READ_ONCE(*pudp);
if (sz != PUD_SIZE && pud_none(pud))
return NULL;
@@ -441,44 +452,30 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma,
clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
}
-static void __init add_huge_page_size(unsigned long size)
-{
- if (size_to_hstate(size))
- return;
-
- hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
-}
-
static int __init hugetlbpage_init(void)
{
#ifdef CONFIG_ARM64_4K_PAGES
- add_huge_page_size(PUD_SIZE);
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
#endif
- add_huge_page_size(CONT_PMD_SIZE);
- add_huge_page_size(PMD_SIZE);
- add_huge_page_size(CONT_PTE_SIZE);
+ hugetlb_add_hstate((CONT_PMD_SHIFT + PMD_SHIFT) - PAGE_SHIFT);
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+ hugetlb_add_hstate((CONT_PTE_SHIFT + PAGE_SHIFT) - PAGE_SHIFT);
return 0;
}
arch_initcall(hugetlbpage_init);
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long ps = memparse(opt, &opt);
-
- switch (ps) {
+ switch (size) {
#ifdef CONFIG_ARM64_4K_PAGES
case PUD_SIZE:
#endif
case CONT_PMD_SIZE:
case PMD_SIZE:
case CONT_PTE_SIZE:
- add_huge_page_size(ps);
- return 1;
+ return true;
}
- hugetlb_bad_size();
- pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10);
- return 0;
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
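
A quick standalone check that the hugetlb_add_hstate() arguments above equal the page orders the removed add_huge_page_size() path derived via ilog2(size) - PAGE_SHIFT. The shift values are the usual arm64 4K-page ones and are hard-coded here purely for the demonstration.

#include <stdio.h>

#define PAGE_SHIFT     12
#define PMD_SHIFT      21
#define PUD_SHIFT      30
#define CONT_PTE_SHIFT 4	/* 16 contiguous PTEs -> 64K  */
#define CONT_PMD_SHIFT 4	/* 16 contiguous PMDs -> 32M  */

int main(void)
{
	/* order = log2(huge page size) - PAGE_SHIFT */
	printf("PUD      order %d (1G)\n",  PUD_SHIFT - PAGE_SHIFT);
	printf("CONT_PMD order %d (32M)\n", (CONT_PMD_SHIFT + PMD_SHIFT) - PAGE_SHIFT);
	printf("PMD      order %d (2M)\n",  PMD_SHIFT - PAGE_SHIFT);
	printf("CONT_PTE order %d (64K)\n", (CONT_PTE_SHIFT + PAGE_SHIFT) - PAGE_SHIFT);
	return 0;
}
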
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index e42727e3568e..e631e6425165 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -192,8 +192,6 @@ static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM());
}
-#ifdef CONFIG_NUMA
-
static void __init zone_sizes_init(unsigned long min, unsigned long max)
{
unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
@@ -206,61 +204,9 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
#endif
max_zone_pfns[ZONE_NORMAL] = max;
- free_area_init_nodes(max_zone_pfns);
-}
-
-#else
-
-static void __init zone_sizes_init(unsigned long min, unsigned long max)
-{
- struct memblock_region *reg;
- unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
- unsigned long __maybe_unused max_dma, max_dma32;
-
- memset(zone_size, 0, sizeof(zone_size));
-
- max_dma = max_dma32 = min;
-#ifdef CONFIG_ZONE_DMA
- max_dma = max_dma32 = PFN_DOWN(arm64_dma_phys_limit);
- zone_size[ZONE_DMA] = max_dma - min;
-#endif
-#ifdef CONFIG_ZONE_DMA32
- max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
- zone_size[ZONE_DMA32] = max_dma32 - max_dma;
-#endif
- zone_size[ZONE_NORMAL] = max - max_dma32;
-
- memcpy(zhole_size, zone_size, sizeof(zhole_size));
-
- for_each_memblock(memory, reg) {
- unsigned long start = memblock_region_memory_base_pfn(reg);
- unsigned long end = memblock_region_memory_end_pfn(reg);
-
-#ifdef CONFIG_ZONE_DMA
- if (start >= min && start < max_dma) {
- unsigned long dma_end = min(end, max_dma);
- zhole_size[ZONE_DMA] -= dma_end - start;
- start = dma_end;
- }
-#endif
-#ifdef CONFIG_ZONE_DMA32
- if (start >= max_dma && start < max_dma32) {
- unsigned long dma32_end = min(end, max_dma32);
- zhole_size[ZONE_DMA32] -= dma32_end - start;
- start = dma32_end;
- }
-#endif
- if (start >= max_dma32 && start < max) {
- unsigned long normal_end = min(end, max);
- zhole_size[ZONE_NORMAL] -= normal_end - start;
- }
- }
-
- free_area_init_node(0, zone_size, min, zhole_size);
+ free_area_init(max_zone_pfns);
}
-#endif /* CONFIG_NUMA */
-
int pfn_valid(unsigned long pfn)
{
phys_addr_t addr = pfn << PAGE_SHIFT;
@@ -272,7 +218,7 @@ int pfn_valid(unsigned long pfn)
if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
return 0;
- if (!valid_section(__nr_to_section(pfn_to_section_nr(pfn))))
+ if (!valid_section(__pfn_to_section(pfn)))
return 0;
#endif
return memblock_is_map_memory(addr);
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index f87a32484ea8..2339811f317b 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -84,17 +84,17 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node,
return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr);
}
-static pud_t *__init kasan_pud_offset(pgd_t *pgdp, unsigned long addr, int node,
+static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node,
bool early)
{
- if (pgd_none(READ_ONCE(*pgdp))) {
+ if (p4d_none(READ_ONCE(*p4dp))) {
phys_addr_t pud_phys = early ?
__pa_symbol(kasan_early_shadow_pud)
: kasan_alloc_zeroed_page(node);
- __pgd_populate(pgdp, pud_phys, PMD_TYPE_TABLE);
+ __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE);
}
- return early ? pud_offset_kimg(pgdp, addr) : pud_offset(pgdp, addr);
+ return early ? pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr);
}
static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
@@ -126,11 +126,11 @@ static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr,
} while (pmdp++, addr = next, addr != end && pmd_none(READ_ONCE(*pmdp)));
}
-static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr,
+static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr,
unsigned long end, int node, bool early)
{
unsigned long next;
- pud_t *pudp = kasan_pud_offset(pgdp, addr, node, early);
+ pud_t *pudp = kasan_pud_offset(p4dp, addr, node, early);
do {
next = pud_addr_end(addr, end);
@@ -138,6 +138,18 @@ static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr,
} while (pudp++, addr = next, addr != end && pud_none(READ_ONCE(*pudp)));
}
+static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr,
+ unsigned long end, int node, bool early)
+{
+ unsigned long next;
+ p4d_t *p4dp = p4d_offset(pgdp, addr);
+
+ do {
+ next = p4d_addr_end(addr, end);
+ kasan_pud_populate(p4dp, addr, next, node, early);
+ } while (p4dp++, addr = next, addr != end);
+}
+
static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
int node, bool early)
{
@@ -147,7 +159,7 @@ static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
pgdp = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- kasan_pud_populate(pgdp, addr, next, node, early);
+ kasan_p4d_populate(pgdp, addr, next, node, early);
} while (pgdp++, addr = next, addr != end);
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a374e4f51a62..e7fbc6275329 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -290,18 +290,19 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
{
unsigned long next;
pud_t *pudp;
- pgd_t pgd = READ_ONCE(*pgdp);
+ p4d_t *p4dp = p4d_offset(pgdp, addr);
+ p4d_t p4d = READ_ONCE(*p4dp);
- if (pgd_none(pgd)) {
+ if (p4d_none(p4d)) {
phys_addr_t pud_phys;
BUG_ON(!pgtable_alloc);
pud_phys = pgtable_alloc(PUD_SHIFT);
- __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE);
- pgd = READ_ONCE(*pgdp);
+ __p4d_populate(p4dp, pud_phys, PUD_TYPE_TABLE);
+ p4d = READ_ONCE(*p4dp);
}
- BUG_ON(pgd_bad(pgd));
+ BUG_ON(p4d_bad(p4d));
- pudp = pud_set_fixmap_offset(pgdp, addr);
+ pudp = pud_set_fixmap_offset(p4dp, addr);
do {
pud_t old_pud = READ_ONCE(*pudp);
@@ -610,6 +611,22 @@ core_initcall(map_entry_trampoline);
#endif
/*
+ * Open coded check for BTI, only for use to determine configuration
+ * for early mappings for before the cpufeature code has run.
+ */
+static bool arm64_early_this_cpu_has_bti(void)
+{
+ u64 pfr1;
+
+ if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
+ return false;
+
+ pfr1 = read_sysreg_s(SYS_ID_AA64PFR1_EL1);
+ return cpuid_feature_extract_unsigned_field(pfr1,
+ ID_AA64PFR1_BT_SHIFT);
+}
+
+/*
* Create fine-grained mappings for the kernel.
*/
static void __init map_kernel(pgd_t *pgdp)
@@ -625,6 +642,14 @@ static void __init map_kernel(pgd_t *pgdp)
pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
/*
+ * If we have a CPU that supports BTI and a kernel built for
+ * BTI then mark the kernel executable text as guarded pages
+ * now so we don't have to rewrite the page tables later.
+ */
+ if (arm64_early_this_cpu_has_bti())
+ text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
+
+ /*
* Only rodata will be remapped with different permissions later on,
* all other segments are allowed to use contiguous mappings.
*/
@@ -648,6 +673,7 @@ static void __init map_kernel(pgd_t *pgdp)
READ_ONCE(*pgd_offset_k(FIXADDR_START)));
} else if (CONFIG_PGTABLE_LEVELS > 3) {
pgd_t *bm_pgdp;
+ p4d_t *bm_p4dp;
pud_t *bm_pudp;
/*
* The fixmap shares its top level pgd entry with the kernel
@@ -657,7 +683,8 @@ static void __init map_kernel(pgd_t *pgdp)
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
bm_pgdp = pgd_offset_raw(pgdp, FIXADDR_START);
- bm_pudp = pud_set_fixmap_offset(bm_pgdp, FIXADDR_START);
+ bm_p4dp = p4d_offset(bm_pgdp, FIXADDR_START);
+ bm_pudp = pud_set_fixmap_offset(bm_p4dp, FIXADDR_START);
pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
pud_clear_fixmap();
} else {
@@ -691,6 +718,7 @@ void __init paging_init(void)
int kern_addr_valid(unsigned long addr)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
pte_t *ptep, pte;
@@ -702,7 +730,11 @@ int kern_addr_valid(unsigned long addr)
if (pgd_none(READ_ONCE(*pgdp)))
return 0;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ if (p4d_none(READ_ONCE(*p4dp)))
+ return 0;
+
+ pudp = pud_offset(p4dp, addr);
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return 0;
@@ -1045,6 +1077,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
unsigned long addr = start;
unsigned long next;
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp;
pmd_t *pmdp;
@@ -1055,7 +1088,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
if (!pgdp)
return -ENOMEM;
- pudp = vmemmap_pud_populate(pgdp, addr, node);
+ p4dp = vmemmap_p4d_populate(pgdp, addr, node);
+ if (!p4dp)
+ return -ENOMEM;
+
+ pudp = vmemmap_pud_populate(p4dp, addr, node);
if (!pudp)
return -ENOMEM;
@@ -1090,11 +1127,12 @@ void vmemmap_free(unsigned long start, unsigned long end,
static inline pud_t * fixmap_pud(unsigned long addr)
{
pgd_t *pgdp = pgd_offset_k(addr);
- pgd_t pgd = READ_ONCE(*pgdp);
+ p4d_t *p4dp = p4d_offset(pgdp, addr);
+ p4d_t p4d = READ_ONCE(*p4dp);
- BUG_ON(pgd_none(pgd) || pgd_bad(pgd));
+ BUG_ON(p4d_none(p4d) || p4d_bad(p4d));
- return pud_offset_kimg(pgdp, addr);
+ return pud_offset_kimg(p4dp, addr);
}
static inline pmd_t * fixmap_pmd(unsigned long addr)
@@ -1120,25 +1158,27 @@ static inline pte_t * fixmap_pte(unsigned long addr)
*/
void __init early_fixmap_init(void)
{
- pgd_t *pgdp, pgd;
+ pgd_t *pgdp;
+ p4d_t *p4dp, p4d;
pud_t *pudp;
pmd_t *pmdp;
unsigned long addr = FIXADDR_START;
pgdp = pgd_offset_k(addr);
- pgd = READ_ONCE(*pgdp);
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = READ_ONCE(*p4dp);
if (CONFIG_PGTABLE_LEVELS > 3 &&
- !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
+ !(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) {
/*
* We only end up here if the kernel mapping and the fixmap
* share the top level pgd entry, which should only happen on
* 16k/4 levels configurations.
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
- pudp = pud_offset_kimg(pgdp, addr);
+ pudp = pud_offset_kimg(p4dp, addr);
} else {
- if (pgd_none(pgd))
- __pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
+ if (p4d_none(p4d))
+ __p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
pudp = fixmap_pud(addr);
}
if (pud_none(READ_ONCE(*pudp)))
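
A minimal sketch of the ID-register field extraction that arm64_early_this_cpu_has_bti() above relies on: ID_AA64PFR1_EL1.BT is a 4-bit unsigned field at bit 0, and any non-zero value means BTI is implemented. The helper name and the register value below are invented for the demo; the real code reads the system register via cpuid_feature_extract_unsigned_field().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ID_AA64PFR1_BT_SHIFT 0

static unsigned int extract_unsigned_field(uint64_t reg, unsigned int shift)
{
	return (reg >> shift) & 0xf;	/* ID register fields are 4 bits wide */
}

int main(void)
{
	uint64_t pfr1 = 0x11;	/* made-up value: BT = 1, next field = 1 */
	bool has_bti = extract_unsigned_field(pfr1, ID_AA64PFR1_BT_SHIFT) != 0;

	printf("BTI %s\n", has_bti ? "implemented" : "not implemented");
	return 0;
}
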
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 4decf1659700..aafcee3e3f7e 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -350,13 +350,16 @@ static int __init numa_register_nodes(void)
struct memblock_region *mblk;
/* Check that valid nid is set to memblks */
- for_each_memblock(memory, mblk)
- if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) {
+ for_each_memblock(memory, mblk) {
+ int mblk_nid = memblock_get_region_node(mblk);
+
+ if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
- mblk->nid, mblk->base,
+ mblk_nid, mblk->base,
mblk->base + mblk->size - 1);
return -EINVAL;
}
+ }
/* Finally register nodes. */
for_each_node_mask(nid, numa_nodes_parsed) {
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 250c49008d73..4175bcb8ccb3 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -126,13 +126,13 @@ int set_memory_nx(unsigned long addr, int numpages)
{
return change_memory_common(addr, numpages,
__pgprot(PTE_PXN),
- __pgprot(0));
+ __pgprot(PTE_MAYBE_GP));
}
int set_memory_x(unsigned long addr, int numpages)
{
return change_memory_common(addr, numpages,
- __pgprot(0),
+ __pgprot(PTE_MAYBE_GP),
__pgprot(PTE_PXN));
}
@@ -198,6 +198,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
bool kernel_page_present(struct page *page)
{
pgd_t *pgdp;
+ p4d_t *p4dp;
pud_t *pudp, pud;
pmd_t *pmdp, pmd;
pte_t *ptep;
@@ -210,7 +211,11 @@ bool kernel_page_present(struct page *page)
if (pgd_none(READ_ONCE(*pgdp)))
return false;
- pudp = pud_offset(pgdp, addr);
+ p4dp = p4d_offset(pgdp, addr);
+ if (p4d_none(READ_ONCE(*p4dp)))
+ return false;
+
+ pudp = pud_offset(p4dp, addr);
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return false;
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 197a9ba2d5ea..b7bebb12a56d 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -58,6 +58,8 @@
* cpu_do_suspend - save CPU registers context
*
* x0: virtual address of context pointer
+ *
+ * This must be kept in sync with struct cpu_suspend_ctx in <asm/suspend.h>.
*/
SYM_FUNC_START(cpu_do_suspend)
mrs x2, tpidr_el0
@@ -82,6 +84,11 @@ alternative_endif
stp x8, x9, [x0, #48]
stp x10, x11, [x0, #64]
stp x12, x13, [x0, #80]
+ /*
+ * Save x18 as it may be used as a platform register, e.g. by shadow
+ * call stack.
+ */
+ str x18, [x0, #96]
ret
SYM_FUNC_END(cpu_do_suspend)
@@ -98,6 +105,13 @@ SYM_FUNC_START(cpu_do_resume)
ldp x9, x10, [x0, #48]
ldp x11, x12, [x0, #64]
ldp x13, x14, [x0, #80]
+ /*
+ * Restore x18, as it may be used as a platform register, and clear
+ * the buffer to minimize the risk of exposure when used for shadow
+ * call stack.
+ */
+ ldr x18, [x0, #96]
+ str xzr, [x0, #96]
msr tpidr_el0, x2
msr tpidrro_el0, x3
msr contextidr_el1, x4
@@ -139,7 +153,7 @@ alternative_if ARM64_HAS_RAS_EXTN
msr_s SYS_DISR_EL1, xzr
alternative_else_nop_endif
- ptrauth_keys_install_kernel x14, 0, x1, x2, x3
+ ptrauth_keys_install_kernel_nosync x14, x1, x2, x3
isb
ret
SYM_FUNC_END(cpu_do_resume)
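
The new str/ldr of x18 at offset #96 means the suspend context must now provide 13 64-bit register slots, which is what the "keep in sync with struct cpu_suspend_ctx" comment is pointing at. The sketch below mirrors that shape; the NR_CTX_REGS value and field names are assumptions for illustration, not the <asm/suspend.h> definition.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define NR_CTX_REGS 13			/* assumption: 12 saved registers + x18 */

struct cpu_suspend_ctx_sketch {
	uint64_t ctx_regs[NR_CTX_REGS];	/* filled by cpu_do_suspend */
	uint64_t sp;
};

int main(void)
{
	/* str x18, [x0, #96] lands in the last ctx_regs slot. */
	assert(offsetof(struct cpu_suspend_ctx_sketch,
			ctx_regs[NR_CTX_REGS - 1]) == 96);
	return 0;
}
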
@@ -386,8 +400,6 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings)
*
* Initialise the processor for turning the MMU on.
*
- * Input:
- * x0 with a flag ARM64_CPU_BOOT_PRIMARY/ARM64_CPU_BOOT_SECONDARY/ARM64_CPU_RUNTIME.
* Output:
* Return in x0 the value of the SCTLR_EL1 register.
*/
@@ -446,51 +458,9 @@ SYM_FUNC_START(__cpu_setup)
1:
#endif /* CONFIG_ARM64_HW_AFDBM */
msr tcr_el1, x10
- mov x1, x0
/*
* Prepare SCTLR
*/
mov_q x0, SCTLR_EL1_SET
-
-#ifdef CONFIG_ARM64_PTR_AUTH
- /* No ptrauth setup for run time cpus */
- cmp x1, #ARM64_CPU_RUNTIME
- b.eq 3f
-
- /* Check if the CPU supports ptrauth */
- mrs x2, id_aa64isar1_el1
- ubfx x2, x2, #ID_AA64ISAR1_APA_SHIFT, #8
- cbz x2, 3f
-
- /*
- * The primary cpu keys are reset here and can be
- * re-initialised with some proper values later.
- */
- msr_s SYS_APIAKEYLO_EL1, xzr
- msr_s SYS_APIAKEYHI_EL1, xzr
-
- /* Just enable ptrauth for primary cpu */
- cmp x1, #ARM64_CPU_BOOT_PRIMARY
- b.eq 2f
-
- /* if !system_supports_address_auth() then skip enable */
-alternative_if_not ARM64_HAS_ADDRESS_AUTH
- b 3f
-alternative_else_nop_endif
-
- /* Install ptrauth key for secondary cpus */
- adr_l x2, secondary_data
- ldr x3, [x2, #CPU_BOOT_TASK] // get secondary_data.task
- cbz x3, 2f // check for slow booting cpus
- ldp x3, x4, [x2, #CPU_BOOT_PTRAUTH_KEY]
- msr_s SYS_APIAKEYLO_EL1, x3
- msr_s SYS_APIAKEYHI_EL1, x4
-
-2: /* Enable ptrauth instructions */
- ldr x2, =SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
- SCTLR_ELx_ENDA | SCTLR_ELx_ENDB
- orr x0, x0, x2
-3:
-#endif
ret // return to head.S
SYM_FUNC_END(__cpu_setup)
diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index eb73f9f72c46..cc0cf0f5c7c3 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -100,6 +100,14 @@
/* Rd = Rn OP imm12 */
#define A64_ADD_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD)
#define A64_SUB_I(sf, Rd, Rn, imm12) A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB)
+#define A64_ADDS_I(sf, Rd, Rn, imm12) \
+ A64_ADDSUB_IMM(sf, Rd, Rn, imm12, ADD_SETFLAGS)
+#define A64_SUBS_I(sf, Rd, Rn, imm12) \
+ A64_ADDSUB_IMM(sf, Rd, Rn, imm12, SUB_SETFLAGS)
+/* Rn + imm12; set condition flags */
+#define A64_CMN_I(sf, Rn, imm12) A64_ADDS_I(sf, A64_ZR, Rn, imm12)
+/* Rn - imm12; set condition flags */
+#define A64_CMP_I(sf, Rn, imm12) A64_SUBS_I(sf, A64_ZR, Rn, imm12)
/* Rd = Rn */
#define A64_MOV(sf, Rd, Rn) A64_ADD_I(sf, Rd, Rn, 0)
@@ -189,4 +197,26 @@
/* Rn & Rm; set condition flags */
#define A64_TST(sf, Rn, Rm) A64_ANDS(sf, A64_ZR, Rn, Rm)
+/* Logical (immediate) */
+#define A64_LOGIC_IMM(sf, Rd, Rn, imm, type) ({ \
+ u64 imm64 = (sf) ? (u64)imm : (u64)(u32)imm; \
+ aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_##type, \
+ A64_VARIANT(sf), Rn, Rd, imm64); \
+})
+/* Rd = Rn OP imm */
+#define A64_AND_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND)
+#define A64_ORR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, ORR)
+#define A64_EOR_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, EOR)
+#define A64_ANDS_I(sf, Rd, Rn, imm) A64_LOGIC_IMM(sf, Rd, Rn, imm, AND_SETFLAGS)
+/* Rn & imm; set condition flags */
+#define A64_TST_I(sf, Rn, imm) A64_ANDS_I(sf, A64_ZR, Rn, imm)
+
+/* HINTs */
+#define A64_HINT(x) aarch64_insn_gen_hint(x)
+
+/* BTI */
+#define A64_BTI_C A64_HINT(AARCH64_INSN_HINT_BTIC)
+#define A64_BTI_J A64_HINT(AARCH64_INSN_HINT_BTIJ)
+#define A64_BTI_JC A64_HINT(AARCH64_INSN_HINT_BTIJC)
+
#endif /* _BPF_JIT_H */
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index cdc79de0c794..3cb25b43b368 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -167,11 +167,21 @@ static inline int epilogue_offset(const struct jit_ctx *ctx)
return to - from;
}
+static bool is_addsub_imm(u32 imm)
+{
+ /* Either imm12 or shifted imm12. */
+ return !(imm & ~0xfff) || !(imm & ~0xfff000);
+}
+
/* Stack must be multiples of 16B */
#define STACK_ALIGN(sz) (((sz) + 15) & ~15)
/* Tail call offset to jump into */
+#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)
+#define PROLOGUE_OFFSET 8
+#else
#define PROLOGUE_OFFSET 7
+#endif
static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
{
@@ -208,6 +218,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
*
*/
+ /* BTI landing pad */
+ if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
+ emit(A64_BTI_C, ctx);
+
/* Save FP and LR registers to stay align with ARM64 AAPCS */
emit(A64_PUSH(A64_FP, A64_LR, A64_SP), ctx);
emit(A64_MOV(1, A64_FP, A64_SP), ctx);
@@ -230,6 +244,10 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf)
cur_offset, PROLOGUE_OFFSET);
return -1;
}
+
+ /* BTI landing pad for the tail call, done with a BR */
+ if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
+ emit(A64_BTI_J, ctx);
}
ctx->stack_size = STACK_ALIGN(prog->aux->stack_depth);
@@ -356,6 +374,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
const bool isdw = BPF_SIZE(code) == BPF_DW;
u8 jmp_cond, reg;
s32 jmp_offset;
+ u32 a64_insn;
#define check_imm(bits, imm) do { \
if ((((imm) > 0) && ((imm) >> (bits))) || \
@@ -478,28 +497,55 @@ emit_bswap_uxt:
/* dst = dst OP imm */
case BPF_ALU | BPF_ADD | BPF_K:
case BPF_ALU64 | BPF_ADD | BPF_K:
- emit_a64_mov_i(is64, tmp, imm, ctx);
- emit(A64_ADD(is64, dst, dst, tmp), ctx);
+ if (is_addsub_imm(imm)) {
+ emit(A64_ADD_I(is64, dst, dst, imm), ctx);
+ } else if (is_addsub_imm(-imm)) {
+ emit(A64_SUB_I(is64, dst, dst, -imm), ctx);
+ } else {
+ emit_a64_mov_i(is64, tmp, imm, ctx);
+ emit(A64_ADD(is64, dst, dst, tmp), ctx);
+ }
break;
case BPF_ALU | BPF_SUB | BPF_K:
case BPF_ALU64 | BPF_SUB | BPF_K:
- emit_a64_mov_i(is64, tmp, imm, ctx);
- emit(A64_SUB(is64, dst, dst, tmp), ctx);
+ if (is_addsub_imm(imm)) {
+ emit(A64_SUB_I(is64, dst, dst, imm), ctx);
+ } else if (is_addsub_imm(-imm)) {
+ emit(A64_ADD_I(is64, dst, dst, -imm), ctx);
+ } else {
+ emit_a64_mov_i(is64, tmp, imm, ctx);
+ emit(A64_SUB(is64, dst, dst, tmp), ctx);
+ }
break;
case BPF_ALU | BPF_AND | BPF_K:
case BPF_ALU64 | BPF_AND | BPF_K:
- emit_a64_mov_i(is64, tmp, imm, ctx);
- emit(A64_AND(is64, dst, dst, tmp), ctx);
+ a64_insn = A64_AND_I(is64, dst, dst, imm);
+ if (a64_insn != AARCH64_BREAK_FAULT) {
+ emit(a64_insn, ctx);
+ } else {
+ emit_a64_mov_i(is64, tmp, imm, ctx);
+ emit(A64_AND(is64, dst, dst, tmp), ctx);
+ }
break;
case BPF_ALU | BPF_OR | BPF_K:
case BPF_ALU64 | BPF_OR | BPF_K:
- emit_a64_mov_i(is64, tmp, imm, ctx);
- emit(A64_ORR(is64, dst, dst, tmp), ctx);
+ a64_insn = A64_ORR_I(is64, dst, dst, imm);
+ if (a64_insn != AARCH64_BREAK_FAULT) {
+ emit(a64_insn, ctx);
+ } else {
+ emit_a64_mov_i(is64, tmp, imm, ctx);
+ emit(A64_ORR(is64, dst, dst, tmp), ctx);
+ }
break;
case BPF_ALU | BPF_XOR | BPF_K:
case BPF_ALU64 | BPF_XOR | BPF_K:
- emit_a64_mov_i(is64, tmp, imm, ctx);
- emit(A64_EOR(is64, dst, dst, tmp), ctx);
+ a64_insn = A64_EOR_I(is64, dst, dst, imm);
+ if (a64_insn != AARCH64_BREAK_FAULT) {
+ emit(a64_insn, ctx);
+ } else {
+ emit_a64_mov_i(is64, tmp, imm, ctx);
+ emit(A64_EOR(is64, dst, dst, tmp), ctx);
+ }
break;
case BPF_ALU | BPF_MUL | BPF_K:
case BPF_ALU64 | BPF_MUL | BPF_K:
@@ -623,13 +669,24 @@ emit_cond_jmp:
case BPF_JMP32 | BPF_JSLT | BPF_K:
case BPF_JMP32 | BPF_JSGE | BPF_K:
case BPF_JMP32 | BPF_JSLE | BPF_K:
- emit_a64_mov_i(is64, tmp, imm, ctx);
- emit(A64_CMP(is64, dst, tmp), ctx);
+ if (is_addsub_imm(imm)) {
+ emit(A64_CMP_I(is64, dst, imm), ctx);
+ } else if (is_addsub_imm(-imm)) {
+ emit(A64_CMN_I(is64, dst, -imm), ctx);
+ } else {
+ emit_a64_mov_i(is64, tmp, imm, ctx);
+ emit(A64_CMP(is64, dst, tmp), ctx);
+ }
goto emit_cond_jmp;
case BPF_JMP | BPF_JSET | BPF_K:
case BPF_JMP32 | BPF_JSET | BPF_K:
- emit_a64_mov_i(is64, tmp, imm, ctx);
- emit(A64_TST(is64, dst, tmp), ctx);
+ a64_insn = A64_TST_I(is64, dst, imm);
+ if (a64_insn != AARCH64_BREAK_FAULT) {
+ emit(a64_insn, ctx);
+ } else {
+ emit_a64_mov_i(is64, tmp, imm, ctx);
+ emit(A64_TST(is64, dst, tmp), ctx);
+ }
goto emit_cond_jmp;
/* function call */
case BPF_JMP | BPF_CALL: