summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild2
-rw-r--r--arch/x86/Kconfig93
-rw-r--r--arch/x86/Kconfig.assembler10
-rw-r--r--arch/x86/Makefile9
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/boot/compressed/efi_mixed.S29
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c2
-rw-r--r--arch/x86/boot/compressed/misc.c1
-rw-r--r--arch/x86/boot/compressed/sev.c197
-rw-r--r--arch/x86/boot/main.c4
-rw-r--r--arch/x86/coco/core.c93
-rw-r--r--arch/x86/configs/hardening.config3
-rw-r--r--arch/x86/configs/tiny.config1
-rw-r--r--arch/x86/crypto/Makefile3
-rw-r--r--arch/x86/crypto/aes-xts-avx-x86_64.S845
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S467
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c400
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S1
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S1
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S253
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S1
-rw-r--r--arch/x86/entry/Makefile2
-rw-r--r--arch/x86/entry/common.c75
-rw-r--r--arch/x86/entry/entry_64.S61
-rw-r--r--arch/x86/entry/entry_64_compat.S17
-rw-r--r--arch/x86/entry/entry_fred.c12
-rw-r--r--arch/x86/entry/syscall_32.c21
-rw-r--r--arch/x86/entry/syscall_64.c19
-rw-r--r--arch/x86/entry/syscall_x32.c10
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/thunk.S (renamed from arch/x86/entry/thunk_64.S)0
-rw-r--r--arch/x86/entry/thunk_32.S18
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c28
-rw-r--r--arch/x86/events/amd/core.c77
-rw-r--r--arch/x86/events/amd/lbr.c25
-rw-r--r--arch/x86/events/core.c1
-rw-r--r--arch/x86/events/intel/cstate.c144
-rw-r--r--arch/x86/events/intel/ds.c8
-rw-r--r--arch/x86/events/intel/lbr.c4
-rw-r--r--arch/x86/events/intel/pt.c12
-rw-r--r--arch/x86/events/intel/uncore.c100
-rw-r--r--arch/x86/events/intel/uncore_nhmex.c3
-rw-r--r--arch/x86/events/intel/uncore_snbep.c5
-rw-r--r--arch/x86/events/msr.c132
-rw-r--r--arch/x86/events/perf_event.h13
-rw-r--r--arch/x86/events/rapl.c7
-rw-r--r--arch/x86/hyperv/hv_apic.c16
-rw-r--r--arch/x86/hyperv/hv_init.c8
-rw-r--r--arch/x86/hyperv/hv_proc.c22
-rw-r--r--arch/x86/hyperv/hv_spinlock.c3
-rw-r--r--arch/x86/hyperv/hv_vtl.c20
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/alternative.h32
-rw-r--r--arch/x86/include/asm/apic.h11
-rw-r--r--arch/x86/include/asm/asm-prototypes.h1
-rw-r--r--arch/x86/include/asm/asm.h3
-rw-r--r--arch/x86/include/asm/atomic.h12
-rw-r--r--arch/x86/include/asm/atomic64_32.h81
-rw-r--r--arch/x86/include/asm/atomic64_64.h12
-rw-r--r--arch/x86/include/asm/barrier.h3
-rw-r--r--arch/x86/include/asm/boot.h6
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h205
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h8
-rw-r--r--arch/x86/include/asm/coco.h3
-rw-r--r--arch/x86/include/asm/cpu_device_id.h101
-rw-r--r--arch/x86/include/asm/cpufeature.h24
-rw-r--r--arch/x86/include/asm/cpufeatures.h15
-rw-r--r--arch/x86/include/asm/crash_reserve.h (renamed from arch/x86/include/asm/crash_core.h)8
-rw-r--r--arch/x86/include/asm/disabled-features.h3
-rw-r--r--arch/x86/include/asm/e820/api.h1
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h2
-rw-r--r--arch/x86/include/asm/hardirq.h8
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h145
-rw-r--r--arch/x86/include/asm/ia32.h11
-rw-r--r--arch/x86/include/asm/ia32_unistd.h12
-rw-r--r--arch/x86/include/asm/idtentry.h8
-rw-r--r--arch/x86/include/asm/intel-family.h85
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/irq_remapping.h7
-rw-r--r--arch/x86/include/asm/irq_stack.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h10
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h29
-rw-r--r--arch/x86/include/asm/mce.h2
-rw-r--r--arch/x86/include/asm/mmu.h2
-rw-r--r--arch/x86/include/asm/mshyperv.h30
-rw-r--r--arch/x86/include/asm/msr-index.h18
-rw-r--r--arch/x86/include/asm/nospec-branch.h38
-rw-r--r--arch/x86/include/asm/page_types.h8
-rw-r--r--arch/x86/include/asm/percpu.h157
-rw-r--r--arch/x86/include/asm/perf_event.h1
-rw-r--r--arch/x86/include/asm/pgtable.h37
-rw-r--r--arch/x86/include/asm/pgtable_types.h5
-rw-r--r--arch/x86/include/asm/posted_intr.h118
-rw-r--r--arch/x86/include/asm/processor.h33
-rw-r--r--arch/x86/include/asm/prom.h9
-rw-r--r--arch/x86/include/asm/qspinlock.h13
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h7
-rw-r--r--arch/x86/include/asm/required-features.h3
-rw-r--r--arch/x86/include/asm/seccomp.h2
-rw-r--r--arch/x86/include/asm/sev.h12
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/special_insns.h8
-rw-r--r--arch/x86/include/asm/string_64.h45
-rw-r--r--arch/x86/include/asm/suspend_32.h10
-rw-r--r--arch/x86/include/asm/svm.h8
-rw-r--r--arch/x86/include/asm/syscall.h11
-rw-r--r--arch/x86/include/asm/text-patching.h2
-rw-r--r--arch/x86/include/asm/uaccess.h4
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h44
-rw-r--r--arch/x86/include/asm/vm86.h2
-rw-r--r--arch/x86/include/asm/vmxfeatures.h1
-rw-r--r--arch/x86/include/asm/x86_init.h3
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h5
-rw-r--r--arch/x86/include/uapi/asm/kvm.h308
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h3
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/cppc.c2
-rw-r--r--arch/x86/kernel/acpi/cstate.c4
-rw-r--r--arch/x86/kernel/alternative.c135
-rw-r--r--arch/x86/kernel/amd_nb.c1
-rw-r--r--arch/x86/kernel/apic/apic.c68
-rw-r--r--arch/x86/kernel/apic/vector.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c7
-rw-r--r--arch/x86/kernel/callthunks.c11
-rw-r--r--arch/x86/kernel/cpu/amd.c78
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c17
-rw-r--r--arch/x86/kernel/cpu/bugs.c200
-rw-r--r--arch/x86/kernel/cpu/common.c253
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c9
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c1
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c12
-rw-r--r--arch/x86/kernel/cpu/match.c5
-rw-r--r--arch/x86/kernel/cpu/mce/core.c28
-rw-r--r--arch/x86/kernel/cpu/mce/genpool.c40
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c21
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c27
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c5
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c103
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c65
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c40
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h8
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c11
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c24
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c12
-rw-r--r--arch/x86/kernel/cpu/resctrl/trace.h (renamed from arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h)24
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c18
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c86
-rw-r--r--arch/x86/kernel/cpu/topology_common.c12
-rw-r--r--arch/x86/kernel/cpu/topology_ext.c15
-rw-r--r--arch/x86/kernel/devicetree.c48
-rw-r--r--arch/x86/kernel/dumpstack.c4
-rw-r--r--arch/x86/kernel/e820.c24
-rw-r--r--arch/x86/kernel/eisa.c3
-rw-r--r--arch/x86/kernel/fpu/core.c4
-rw-r--r--arch/x86/kernel/fpu/xstate.c9
-rw-r--r--arch/x86/kernel/fpu/xstate.h14
-rw-r--r--arch/x86/kernel/head64.c18
-rw-r--r--arch/x86/kernel/head_32.S13
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/idt.c5
-rw-r--r--arch/x86/kernel/irq.c176
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c4
-rw-r--r--arch/x86/kernel/kprobes/common.h2
-rw-r--r--arch/x86/kernel/kprobes/core.c109
-rw-r--r--arch/x86/kernel/kvm.c17
-rw-r--r--arch/x86/kernel/machine_kexec_64.c3
-rw-r--r--arch/x86/kernel/mpparse.c10
-rw-r--r--arch/x86/kernel/nmi.c24
-rw-r--r--arch/x86/kernel/probe_roms.c10
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/rtc.c1
-rw-r--r--arch/x86/kernel/setup.c57
-rw-r--r--arch/x86/kernel/sev-shared.c6
-rw-r--r--arch/x86/kernel/sev.c51
-rw-r--r--arch/x86/kernel/shstk.c4
-rw-r--r--arch/x86/kernel/signal_32.c2
-rw-r--r--arch/x86/kernel/signal_64.c6
-rw-r--r--arch/x86/kernel/smp.c2
-rw-r--r--arch/x86/kernel/smpboot.c52
-rw-r--r--arch/x86/kernel/topology.c43
-rw-r--r--arch/x86/kernel/tsc.c8
-rw-r--r--arch/x86/kernel/tsc_msr.c14
-rw-r--r--arch/x86/kernel/tsc_sync.c6
-rw-r--r--arch/x86/kernel/vmcore_info_32.c (renamed from arch/x86/kernel/crash_core_32.c)2
-rw-r--r--arch/x86/kernel/vmcore_info_64.c (renamed from arch/x86/kernel/crash_core_64.c)2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S10
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/Kconfig7
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/cpuid.c43
-rw-r--r--arch/x86/kvm/cpuid.h10
-rw-r--r--arch/x86/kvm/debugfs.c3
-rw-r--r--arch/x86/kvm/emulate.c47
-rw-r--r--arch/x86/kvm/kvm_emulate.h4
-rw-r--r--arch/x86/kvm/lapic.c35
-rw-r--r--arch/x86/kvm/mmu/mmu.c54
-rw-r--r--arch/x86/kvm/mmu/page_track.c68
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c175
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h2
-rw-r--r--arch/x86/kvm/pmu.c173
-rw-r--r--arch/x86/kvm/pmu.h57
-rw-r--r--arch/x86/kvm/reverse_cpuid.h5
-rw-r--r--arch/x86/kvm/smm.c15
-rw-r--r--arch/x86/kvm/svm/pmu.c22
-rw-r--r--arch/x86/kvm/svm/sev.c62
-rw-r--r--arch/x86/kvm/svm/svm.c42
-rw-r--r--arch/x86/kvm/svm/svm.h3
-rw-r--r--arch/x86/kvm/svm/vmenter.S97
-rw-r--r--arch/x86/kvm/trace.h19
-rw-r--r--arch/x86/kvm/vmx/nested.c4
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c222
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c4
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h93
-rw-r--r--arch/x86/kvm/vmx/vmenter.S2
-rw-r--r--arch/x86/kvm/vmx/vmx.c201
-rw-r--r--arch/x86/kvm/vmx/vmx.h10
-rw-r--r--arch/x86/kvm/x86.c232
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/kvm/xen.c315
-rw-r--r--arch/x86/kvm/xen.h18
-rw-r--r--arch/x86/lib/retpoline.S22
-rw-r--r--arch/x86/math-emu/fpu_etc.c9
-rw-r--r--arch/x86/math-emu/fpu_trig.c6
-rw-r--r--arch/x86/math-emu/reg_constant.c7
-rw-r--r--arch/x86/mm/dump_pagetables.c24
-rw-r--r--arch/x86/mm/extable.c9
-rw-r--r--arch/x86/mm/fault.c56
-rw-r--r--arch/x86/mm/ident_map.c23
-rw-r--r--arch/x86/mm/init.c16
-rw-r--r--arch/x86/mm/init_32.c4
-rw-r--r--arch/x86/mm/init_64.c16
-rw-r--r--arch/x86/mm/kasan_init_64.c4
-rw-r--r--arch/x86/mm/mem_encrypt.c7
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c18
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c6
-rw-r--r--arch/x86/mm/numa.c4
-rw-r--r--arch/x86/mm/numa_32.c1
-rw-r--r--arch/x86/mm/pat/memtype.c49
-rw-r--r--arch/x86/mm/pat/set_memory.c82
-rw-r--r--arch/x86/mm/pgtable.c4
-rw-r--r--arch/x86/mm/pti.c10
-rw-r--r--arch/x86/mm/tlb.c39
-rw-r--r--arch/x86/net/bpf_jit_comp.c184
-rw-r--r--arch/x86/net/bpf_jit_comp32.c3
-rw-r--r--arch/x86/pci/ce4100.c6
-rw-r--r--arch/x86/pci/fixup.c48
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c54
-rw-r--r--arch/x86/platform/ce4100/ce4100.c1
-rw-r--r--arch/x86/platform/iris/iris.c5
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-pm.c7
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c5
-rw-r--r--arch/x86/platform/pvh/enlighten.c3
-rw-r--r--arch/x86/power/Makefile2
-rw-r--r--arch/x86/power/hibernate.c6
-rw-r--r--arch/x86/purgatory/Makefile3
-rw-r--r--arch/x86/tools/relocs.c371
-rw-r--r--arch/x86/virt/Makefile2
-rw-r--r--arch/x86/virt/svm/sev.c62
-rw-r--r--arch/x86/virt/vmx/tdx/tdx.c1
-rw-r--r--arch/x86/xen/Kconfig1
-rw-r--r--arch/x86/xen/enlighten.c35
-rw-r--r--arch/x86/xen/enlighten_hvm.c4
-rw-r--r--arch/x86/xen/enlighten_pv.c11
-rw-r--r--arch/x86/xen/enlighten_pvh.c68
-rw-r--r--arch/x86/xen/mmu_pv.c14
-rw-r--r--arch/x86/xen/setup.c44
-rw-r--r--arch/x86/xen/smp_pv.c4
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h14
279 files changed, 6643 insertions, 4031 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 6a1f36df6a18..cf0ad89f5639 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -28,7 +28,7 @@ obj-y += net/
obj-$(CONFIG_KEXEC_FILE) += purgatory/
-obj-y += virt/svm/
+obj-y += virt/
# for cleaning
subdir- += boot tools
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 89afacce6951..9d16fee6bdb8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -62,6 +62,7 @@ config X86
select ACPI_HOTPLUG_CPU if ACPI_PROCESSOR && HOTPLUG_CPU
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_INIT
+ select ARCH_CONFIGURES_CPU_MITIGATIONS
select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64
@@ -168,6 +169,7 @@ config X86
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
+ select GENERIC_VDSO_OVERFLOW_PROTECT
select GUP_GET_PXX_LOW_HIGH if X86_PAE
select HARDIRQS_SW_RESEND
select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
@@ -245,7 +247,6 @@ config X86
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_KRETPROBES
select HAVE_RETHOOK
- select HAVE_KVM
select HAVE_LIVEPATCH if X86_64
select HAVE_MIXED_BREAKPOINTS_REGS
select HAVE_MOD_ARCH_SPECIFIC
@@ -465,6 +466,17 @@ config X86_X2APIC
If you don't know what to do here, say N.
+config X86_POSTED_MSI
+ bool "Enable MSI and MSI-x delivery by posted interrupts"
+ depends on X86_64 && IRQ_REMAP
+ help
+ This enables MSIs that are under interrupt remapping to be delivered as
+ posted interrupts to the host kernel. Interrupt throughput can
+ potentially be improved by coalescing CPU notifications during high
+ frequency bursts.
+
+ If you don't know what to do here, say N.
+
config X86_MPPARSE
bool "Enable MPS table" if ACPI
default y
@@ -473,10 +485,6 @@ config X86_MPPARSE
For old smp systems that do not have proper acpi support. Newer systems
(esp with 64bit cpus) with acpi support, MADT and DSDT will override it
-config GOLDFISH
- def_bool y
- depends on X86_GOLDFISH
-
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
@@ -505,12 +513,11 @@ config X86_FRED
When enabled, try to use Flexible Return and Event Delivery
instead of the legacy SYSCALL/SYSENTER/IDT architecture for
ring transitions and exception/interrupt handling if the
- system supports.
+ system supports it.
-if X86_32
config X86_BIGSMP
bool "Support for big SMP systems with more than 8 CPUs"
- depends on SMP
+ depends on SMP && X86_32
help
This option is needed for the systems that have more than 8 CPUs.
@@ -523,7 +530,10 @@ config X86_EXTENDED_PLATFORM
systems out there.)
If you enable this option then you'll be able to select support
- for the following (non-PC) 32 bit x86 platforms:
+ for the following non-PC x86 platforms, depending on the value of
+ CONFIG_64BIT.
+
+ 32-bit platforms (CONFIG_64BIT=n):
Goldfish (Android emulator)
AMD Elan
RDC R-321x SoC
@@ -531,28 +541,14 @@ config X86_EXTENDED_PLATFORM
STA2X11-based (e.g. Northville)
Moorestown MID devices
- If you have one of these systems, or if you want to build a
- generic distribution kernel, say Y here - otherwise say N.
-endif # X86_32
-
-if X86_64
-config X86_EXTENDED_PLATFORM
- bool "Support for extended (non-PC) x86 platforms"
- default y
- help
- If you disable this option then the kernel will only support
- standard PC platforms. (which covers the vast majority of
- systems out there.)
-
- If you enable this option then you'll be able to select support
- for the following (non-PC) 64 bit x86 platforms:
+ 64-bit platforms (CONFIG_64BIT=y):
Numascale NumaChip
ScaleMP vSMP
SGI Ultraviolet
If you have one of these systems, or if you want to build a
generic distribution kernel, say Y here - otherwise say N.
-endif # X86_64
+
# This is an alphabetically sorted list of 64 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
config X86_NUMACHIP
@@ -1065,8 +1061,9 @@ config SCHED_MC
config SCHED_MC_PRIO
bool "CPU core priorities scheduler support"
- depends on SCHED_MC && CPU_SUP_INTEL
- select X86_INTEL_PSTATE
+ depends on SCHED_MC
+ select X86_INTEL_PSTATE if CPU_SUP_INTEL
+ select X86_AMD_PSTATE if CPU_SUP_AMD && ACPI
select CPU_FREQ
default y
help
@@ -2104,7 +2101,7 @@ config ARCH_SUPPORTS_CRASH_HOTPLUG
def_bool y
config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
- def_bool CRASH_CORE
+ def_bool CRASH_RESERVE
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
@@ -2433,16 +2430,20 @@ source "kernel/livepatch/Kconfig"
endmenu
config CC_HAS_NAMED_AS
- def_bool CC_IS_GCC && GCC_VERSION >= 120100
+ def_bool CC_IS_GCC && GCC_VERSION >= 90100
+
+config CC_HAS_NAMED_AS_FIXED_SANITIZERS
+ def_bool CC_IS_GCC && GCC_VERSION >= 130300
config USE_X86_SEG_SUPPORT
def_bool y
depends on CC_HAS_NAMED_AS
#
- # -fsanitize=kernel-address (KASAN) is at the moment incompatible
- # with named address spaces - see GCC PR sanitizer/111736.
+ # -fsanitize=kernel-address (KASAN) and -fsanitize=thread
+ # (KCSAN) are incompatible with named address spaces with
+ # GCC < 13.3 - see GCC PR sanitizer/111736.
#
- depends on !KASAN
+ depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS
config CC_HAS_SLS
def_bool $(cc-option,-mharden-sls=all)
@@ -2490,17 +2491,21 @@ config PREFIX_SYMBOLS
def_bool y
depends on CALL_PADDING && !CFI_CLANG
-menuconfig SPECULATION_MITIGATIONS
- bool "Mitigations for speculative execution vulnerabilities"
+menuconfig CPU_MITIGATIONS
+ bool "Mitigations for CPU vulnerabilities"
default y
help
- Say Y here to enable options which enable mitigations for
- speculative execution hardware vulnerabilities.
+ Say Y here to enable options which enable mitigations for hardware
+ vulnerabilities (usually related to speculative execution).
+ Mitigations can be disabled or restricted to SMT systems at runtime
+ via the "mitigations" kernel parameter.
- If you say N, all mitigations will be disabled. You really
- should know what you are doing to say so.
+ If you say N, all mitigations will be disabled. This CANNOT be
+ overridden at runtime.
-if SPECULATION_MITIGATIONS
+ Say 'Y', unless you really know what you are doing.
+
+if CPU_MITIGATIONS
config MITIGATION_PAGE_TABLE_ISOLATION
bool "Remove the kernel mapping in user mode"
@@ -2635,6 +2640,16 @@ config MITIGATION_RFDS
stored in floating point, vector and integer registers.
See also <file:Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst>
+config MITIGATION_SPECTRE_BHI
+ bool "Mitigate Spectre-BHB (Branch History Injection)"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable BHI mitigations. BHI attacks are a form of Spectre V2 attacks
+ where the branch history buffer is poisoned to speculatively steer
+ indirect branches.
+ See <file:Documentation/admin-guide/hw-vuln/spectre.rst>
+
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 8ad41da301e5..59aedf32c4ea 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -25,6 +25,16 @@ config AS_GFNI
help
Supported by binutils >= 2.30 and LLVM integrated assembler
+config AS_VAES
+ def_bool $(as-instr,vaesenc %ymm0$(comma)%ymm1$(comma)%ymm2)
+ help
+ Supported by binutils >= 2.30 and LLVM integrated assembler
+
+config AS_VPCLMULQDQ
+ def_bool $(as-instr,vpclmulqdq \$0x10$(comma)%ymm0$(comma)%ymm1$(comma)%ymm2)
+ help
+ Supported by binutils >= 2.30 and LLVM integrated assembler
+
config AS_WRUSS
def_bool $(as-instr,wrussq %rax$(comma)(%rbx))
help
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1eccc2ee45fb..5ab93fcdd691 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -71,6 +71,7 @@ export BITS
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json
KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
ifeq ($(CONFIG_X86_KERNEL_IBT),y)
@@ -220,12 +221,6 @@ endif
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
-ifdef CONFIG_LTO_CLANG
-ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y)
-KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
-endif
-endif
-
ifdef CONFIG_X86_NEED_RELOCS
LDFLAGS_vmlinux := --emit-relocs --discard-none
else
@@ -256,8 +251,6 @@ archheaders:
libs-y += arch/x86/lib/
-core-y += arch/x86/virt/
-
# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f19c038409aa..e9522c6893be 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -84,7 +84,7 @@ LDFLAGS_vmlinux += -T
hostprogs := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__start_rodata\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
quiet_cmd_voffset = VOFFSET $@
cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S
index f4e22ef774ab..876fc6d46a13 100644
--- a/arch/x86/boot/compressed/efi_mixed.S
+++ b/arch/x86/boot/compressed/efi_mixed.S
@@ -15,10 +15,12 @@
*/
#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/page_types.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
+#include <asm/setup.h>
.code64
.text
@@ -49,6 +51,11 @@ SYM_FUNC_START(startup_64_mixed_mode)
lea efi32_boot_args(%rip), %rdx
mov 0(%rdx), %edi
mov 4(%rdx), %esi
+
+ /* Switch to the firmware's stack */
+ movl efi32_boot_sp(%rip), %esp
+ andl $~7, %esp
+
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
mov 8(%rdx), %edx // saved bootparams pointer
test %edx, %edx
@@ -144,6 +151,7 @@ SYM_FUNC_END(__efi64_thunk)
SYM_FUNC_START(efi32_stub_entry)
call 1f
1: popl %ecx
+ leal (efi32_boot_args - 1b)(%ecx), %ebx
/* Clear BSS */
xorl %eax, %eax
@@ -158,6 +166,7 @@ SYM_FUNC_START(efi32_stub_entry)
popl %ecx
popl %edx
popl %esi
+ movl %esi, 8(%ebx)
jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
@@ -234,8 +243,6 @@ SYM_FUNC_END(efi_enter32)
*
* Arguments: %ecx image handle
* %edx EFI system table pointer
- * %esi struct bootparams pointer (or NULL when not using
- * the EFI handover protocol)
*
* Since this is the point of no return for ordinary execution, no registers
* are considered live except for the function parameters. [Note that the EFI
@@ -254,13 +261,25 @@ SYM_FUNC_START_LOCAL(efi32_entry)
/* Store firmware IDT descriptor */
sidtl (efi32_boot_idt - 1b)(%ebx)
+ /* Store firmware stack pointer */
+ movl %esp, (efi32_boot_sp - 1b)(%ebx)
+
/* Store boot arguments */
leal (efi32_boot_args - 1b)(%ebx), %ebx
movl %ecx, 0(%ebx)
movl %edx, 4(%ebx)
- movl %esi, 8(%ebx)
movb $0x0, 12(%ebx) // efi_is64
+ /*
+ * Allocate some memory for a temporary struct boot_params, which only
+ * needs the minimal pieces that startup_32() relies on.
+ */
+ subl $PARAM_SIZE, %esp
+ movl %esp, %esi
+ movl $PAGE_SIZE, BP_kernel_alignment(%esi)
+ movl $_end - 1b, BP_init_size(%esi)
+ subl $startup_32 - 1b, BP_init_size(%esi)
+
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
@@ -286,8 +305,7 @@ SYM_FUNC_START(efi32_pe_entry)
movl 8(%ebp), %ecx // image_handle
movl 12(%ebp), %edx // sys_table
- xorl %esi, %esi
- jmp efi32_entry // pass %ecx, %edx, %esi
+ jmp efi32_entry // pass %ecx, %edx
// no other registers remain live
2: popl %edi // restore callee-save registers
@@ -318,5 +336,6 @@ SYM_DATA_END(efi32_boot_idt)
SYM_DATA_LOCAL(efi32_boot_cs, .word 0)
SYM_DATA_LOCAL(efi32_boot_ds, .word 0)
+SYM_DATA_LOCAL(efi32_boot_sp, .long 0)
SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
SYM_DATA(efi_is64, .byte 1)
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index bf4a10a5794f..1dcb794c5479 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -398,6 +398,11 @@ SYM_CODE_START(startup_64)
call sev_enable
#endif
+ /* Preserve only the CR4 bits that must be preserved, and clear the rest */
+ movq %cr4, %rax
+ andl $(X86_CR4_PAE | X86_CR4_MCE | X86_CR4_LA57), %eax
+ movq %rax, %cr4
+
/*
* configure_5level_paging() updates the number of paging levels using
* a trampoline in 32-bit addressable memory if the current number does
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index 909f2a35b60c..dfb9c2deb77c 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -284,7 +284,7 @@ static int set_clr_page_flags(struct x86_mapping_info *info,
pudp = pud_offset(p4dp, address);
pmdp = pmd_offset(pudp, address);
- if (pmd_large(*pmdp))
+ if (pmd_leaf(*pmdp))
ptep = split_large_pmd(info, pmdp, address);
else
ptep = pte_offset_kernel(pmdp, address);
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 4535242cc1b1..b70e4a21c15f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -344,6 +344,7 @@ static size_t parse_elf(void *output)
return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
}
+const unsigned long kernel_text_size = VO___start_rodata - VO__text;
const unsigned long kernel_total_size = VO__end - VO__text;
static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index ec71846d28c9..0457a9d7e515 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -335,26 +335,6 @@ finish:
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
}
-static void enforce_vmpl0(void)
-{
- u64 attrs;
- int err;
-
- /*
- * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically
- * higher) privilege level. Here, clear the VMPL1 permission mask of the
- * GHCB page. If the guest is not running at VMPL0, this will fail.
- *
- * If the guest is running at VMPL0, it will succeed. Even if that operation
- * modifies permission bits, it is still ok to do so currently because Linux
- * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks
- * changing is a don't-care.
- */
- attrs = 1;
- if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
-}
-
/*
* SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
* guest side implementation for proper functioning of the guest. If any
@@ -413,6 +393,85 @@ void snp_check_features(void)
}
}
+/* Search for Confidential Computing blob in the EFI config table. */
+static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
+{
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ int ret;
+
+ ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
+ if (ret)
+ return NULL;
+
+ return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
+ cfg_table_len,
+ EFI_CC_BLOB_GUID);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the boot kernel
+ * by firmware/bootloader in the following ways:
+ *
+ * - via an entry in the EFI config table
+ * - via a setup_data structure, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ cc_info = find_cc_blob_efi(bp);
+ if (cc_info)
+ goto found_cc_info;
+
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ return cc_info;
+}
+
+/*
+ * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
+ * will verify the SNP CPUID/MSR bits.
+ */
+static bool early_snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ /*
+ * If a SNP-specific Confidential Computing blob is present, then
+ * firmware/bootloader have indicated SNP support. Verifying this
+ * involves CPUID checks which will be more reliable if the SNP
+ * CPUID table is used. See comments over snp_setup_cpuid_table() for
+ * more details.
+ */
+ setup_cpuid_table(cc_info);
+
+ /*
+ * Pass run-time kernel a pointer to CC info via boot_params so EFI
+ * config table doesn't need to be searched again during early startup
+ * phase.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
/*
* sev_check_cpu_support - Check for SEV support in the CPU capabilities
*
@@ -463,7 +522,7 @@ void sev_enable(struct boot_params *bp)
bp->cc_blob_address = 0;
/*
- * Do an initial SEV capability check before snp_init() which
+ * Do an initial SEV capability check before early_snp_init() which
* loads the CPUID page and the same checks afterwards are done
* without the hypervisor and are trustworthy.
*
@@ -478,7 +537,7 @@ void sev_enable(struct boot_params *bp)
* Setup/preliminary detection of SNP. This will be sanity-checked
* against CPUID/MSR values later.
*/
- snp = snp_init(bp);
+ snp = early_snp_init(bp);
/* Now repeat the checks with the SNP CPUID table. */
@@ -509,7 +568,20 @@ void sev_enable(struct boot_params *bp)
if (!(get_hv_features() & GHCB_HV_FT_SNP))
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
- enforce_vmpl0();
+ /*
+ * Enforce running at VMPL0.
+ *
+ * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically
+ * higher) privilege level. Here, clear the VMPL1 permission mask of the
+ * GHCB page. If the guest is not running at VMPL0, this will fail.
+ *
+ * If the guest is running at VMPL0, it will succeed. Even if that operation
+ * modifies permission bits, it is still ok to do so currently because Linux
+ * SNP guests running at VMPL0 only run at VMPL0, so VMPL1 or higher
+ * permission mask changes are a don't-care.
+ */
+ if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, 1))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
}
if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
@@ -535,85 +607,6 @@ u64 sev_get_status(void)
return m.q;
}
-/* Search for Confidential Computing blob in the EFI config table. */
-static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
-{
- unsigned long cfg_table_pa;
- unsigned int cfg_table_len;
- int ret;
-
- ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
- if (ret)
- return NULL;
-
- return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
- cfg_table_len,
- EFI_CC_BLOB_GUID);
-}
-
-/*
- * Initial set up of SNP relies on information provided by the
- * Confidential Computing blob, which can be passed to the boot kernel
- * by firmware/bootloader in the following ways:
- *
- * - via an entry in the EFI config table
- * - via a setup_data structure, as defined by the Linux Boot Protocol
- *
- * Scan for the blob in that order.
- */
-static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- cc_info = find_cc_blob_efi(bp);
- if (cc_info)
- goto found_cc_info;
-
- cc_info = find_cc_blob_setup_data(bp);
- if (!cc_info)
- return NULL;
-
-found_cc_info:
- if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
-
- return cc_info;
-}
-
-/*
- * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
- * will verify the SNP CPUID/MSR bits.
- */
-bool snp_init(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- if (!bp)
- return false;
-
- cc_info = find_cc_blob(bp);
- if (!cc_info)
- return false;
-
- /*
- * If a SNP-specific Confidential Computing blob is present, then
- * firmware/bootloader have indicated SNP support. Verifying this
- * involves CPUID checks which will be more reliable if the SNP
- * CPUID table is used. See comments over snp_setup_cpuid_table() for
- * more details.
- */
- setup_cpuid_table(cc_info);
-
- /*
- * Pass run-time kernel a pointer to CC info via boot_params so EFI
- * config table doesn't need to be searched again during early startup
- * phase.
- */
- bp->cc_blob_address = (u32)(unsigned long)cc_info;
-
- return true;
-}
-
void sev_prep_identity_maps(unsigned long top_level_pgt)
{
/*
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index c4ea5258ab55..9049f390d834 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -119,8 +119,8 @@ static void init_heap(void)
char *stack_end;
if (boot_params.hdr.loadflags & CAN_USE_HEAP) {
- asm("leal %P1(%%esp),%0"
- : "=r" (stack_end) : "i" (-STACK_SIZE));
+ asm("leal %n1(%%esp),%0"
+ : "=r" (stack_end) : "i" (STACK_SIZE));
heap_end = (char *)
((size_t)boot_params.hdr.heap_end_ptr + 0x200);
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index d07be9d05cd0..b31ef2424d19 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -3,19 +3,28 @@
* Confidential Computing Platform Capability checks
*
* Copyright (C) 2021 Advanced Micro Devices, Inc.
+ * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
#include <linux/export.h>
#include <linux/cc_platform.h>
+#include <linux/string.h>
+#include <linux/random.h>
+#include <asm/archrandom.h>
#include <asm/coco.h>
#include <asm/processor.h>
enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
u64 cc_mask __ro_after_init;
+static struct cc_attr_flags {
+ __u64 host_sev_snp : 1,
+ __resv : 63;
+} cc_flags;
+
static bool noinstr intel_cc_platform_has(enum cc_attr attr)
{
switch (attr) {
@@ -89,6 +98,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr)
case CC_ATTR_GUEST_SEV_SNP:
return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
+ case CC_ATTR_HOST_SEV_SNP:
+ return cc_flags.host_sev_snp;
+
default:
return false;
}
@@ -148,3 +160,84 @@ u64 cc_mkdec(u64 val)
}
}
EXPORT_SYMBOL_GPL(cc_mkdec);
+
+static void amd_cc_platform_clear(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_HOST_SEV_SNP:
+ cc_flags.host_sev_snp = 0;
+ break;
+ default:
+ break;
+ }
+}
+
+void cc_platform_clear(enum cc_attr attr)
+{
+ switch (cc_vendor) {
+ case CC_VENDOR_AMD:
+ amd_cc_platform_clear(attr);
+ break;
+ default:
+ break;
+ }
+}
+
+static void amd_cc_platform_set(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_HOST_SEV_SNP:
+ cc_flags.host_sev_snp = 1;
+ break;
+ default:
+ break;
+ }
+}
+
+void cc_platform_set(enum cc_attr attr)
+{
+ switch (cc_vendor) {
+ case CC_VENDOR_AMD:
+ amd_cc_platform_set(attr);
+ break;
+ default:
+ break;
+ }
+}
+
+__init void cc_random_init(void)
+{
+ /*
+ * The seed is 32 bytes (in units of longs), which is 256 bits, which
+ * is the security level that the RNG is targeting.
+ */
+ unsigned long rng_seed[32 / sizeof(long)];
+ size_t i, longs;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * Since the CoCo threat model includes the host, the only reliable
+ * source of entropy that can be neither observed nor manipulated is
+ * RDRAND. Usually, RDRAND failure is considered tolerable, but since
+ * CoCo guests have no other unobservable source of entropy, it's
+ * important to at least ensure the RNG gets some initial random seeds.
+ */
+ for (i = 0; i < ARRAY_SIZE(rng_seed); i += longs) {
+ longs = arch_get_random_longs(&rng_seed[i], ARRAY_SIZE(rng_seed) - i);
+
+ /*
+ * A zero return value means that the guest doesn't have RDRAND
+ * or the CPU is physically broken, and in both cases that
+ * means most crypto inside of the CoCo instance will be
+ * broken, defeating the purpose of CoCo in the first place. So
+ * just panic here because it's absolutely unsafe to continue
+ * executing.
+ */
+ if (longs == 0)
+ panic("RDRAND is defective.");
+ }
+ add_device_randomness(rng_seed, sizeof(rng_seed));
+ memzero_explicit(rng_seed, sizeof(rng_seed));
+}
diff --git a/arch/x86/configs/hardening.config b/arch/x86/configs/hardening.config
index 7b497f3b7bc3..de319852a1e9 100644
--- a/arch/x86/configs/hardening.config
+++ b/arch/x86/configs/hardening.config
@@ -10,5 +10,8 @@ CONFIG_INTEL_IOMMU_DEFAULT_ON=y
CONFIG_INTEL_IOMMU_SVM=y
CONFIG_AMD_IOMMU=y
+# Enforce CET Indirect Branch Tracking in the kernel.
+CONFIG_X86_KERNEL_IBT=y
+
# Enable CET Shadow Stack for userspace.
CONFIG_X86_USER_SHADOW_STACK=y
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 66c9e2aab16c..be3ee4294903 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,5 +1,6 @@
CONFIG_NOHIGHMEM=y
# CONFIG_HIGHMEM4G is not set
# CONFIG_HIGHMEM64G is not set
+# CONFIG_UNWINDER_ORC is not set
CONFIG_UNWINDER_GUESS=y
# CONFIG_UNWINDER_FRAME_POINTER is not set
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9aa46093c91b..9c5ce5613738 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,7 +48,8 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \
+ aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
new file mode 100644
index 000000000000..48f97b79f7a9
--- /dev/null
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -0,0 +1,845 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * AES-XTS for modern x86_64 CPUs
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+/*
+ * This file implements AES-XTS for modern x86_64 CPUs. To handle the
+ * complexities of coding for x86 SIMD, e.g. where every vector length needs
+ * different code, it uses a macro to generate several implementations that
+ * share similar source code but are targeted at different CPUs, listed below:
+ *
+ * AES-NI + AVX
+ * - 128-bit vectors (1 AES block per vector)
+ * - VEX-coded instructions
+ * - xmm0-xmm15
+ * - This is for older CPUs that lack VAES but do have AVX.
+ *
+ * VAES + VPCLMULQDQ + AVX2
+ * - 256-bit vectors (2 AES blocks per vector)
+ * - VEX-coded instructions
+ * - ymm0-ymm15
+ * - This is for CPUs that have VAES but lack AVX512 or AVX10,
+ * e.g. Intel's Alder Lake and AMD's Zen 3.
+ *
+ * VAES + VPCLMULQDQ + AVX10/256 + BMI2
+ * - 256-bit vectors (2 AES blocks per vector)
+ * - EVEX-coded instructions
+ * - ymm0-ymm31
+ * - This is for CPUs that have AVX512 but where using zmm registers causes
+ * downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
+ * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
+ * To avoid confusion with 512-bit, we just write AVX10/256.
+ *
+ * VAES + VPCLMULQDQ + AVX10/512 + BMI2
+ * - Same as the previous one, but upgrades to 512-bit vectors
+ * (4 AES blocks per vector) in zmm0-zmm31.
+ * - This is for CPUs that have good AVX512 or AVX10/512 support.
+ *
+ * This file doesn't have an implementation for AES-NI alone (without AVX), as
+ * the lack of VEX would make all the assembly code different.
+ *
+ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
+ * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be
+ * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might
+ * need to start also providing an implementation using VAES alone.
+ *
+ * The AES-XTS implementations in this file support everything required by the
+ * crypto API, including support for arbitrary input lengths and multi-part
+ * processing. However, they are most heavily optimized for the common case of
+ * power-of-2 length inputs that are processed in a single part (disk sectors).
+ */
+
+#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+
+.section .rodata
+.p2align 4
+.Lgf_poly:
+ // The low 64 bits of this value represent the polynomial x^7 + x^2 + x
+ // + 1. It is the value that must be XOR'd into the low 64 bits of the
+ // tweak each time a 1 is carried out of the high 64 bits.
+ //
+ // The high 64 bits of this value is just the internal carry bit that
+ // exists when there's a carry out of the low 64 bits of the tweak.
+ .quad 0x87, 1
+
+ // This table contains constants for vpshufb and vpblendvb, used to
+ // handle variable byte shifts and blending during ciphertext stealing
+ // on CPUs that don't support AVX10-style masking.
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+.text
+
+// Function parameters
+.set KEY, %rdi // Initially points to crypto_aes_ctx, then is
+ // advanced to point to 7th-from-last round key
+.set SRC, %rsi // Pointer to next source data
+.set DST, %rdx // Pointer to next destination data
+.set LEN, %ecx // Remaining length in bytes
+.set LEN8, %cl
+.set LEN64, %rcx
+.set TWEAK, %r8 // Pointer to next tweak
+
+// %rax holds the AES key length in bytes.
+.set KEYLEN, %eax
+.set KEYLEN64, %rax
+
+// %r9-r11 are available as temporaries.
+
+.macro _define_Vi i
+.if VL == 16
+ .set V\i, %xmm\i
+.elseif VL == 32
+ .set V\i, %ymm\i
+.elseif VL == 64
+ .set V\i, %zmm\i
+.else
+ .error "Unsupported Vector Length (VL)"
+.endif
+.endm
+
+.macro _define_aliases
+ // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
+ // are available, that map to the xmm, ymm, or zmm registers according
+ // to the selected Vector Length (VL).
+ _define_Vi 0
+ _define_Vi 1
+ _define_Vi 2
+ _define_Vi 3
+ _define_Vi 4
+ _define_Vi 5
+ _define_Vi 6
+ _define_Vi 7
+ _define_Vi 8
+ _define_Vi 9
+ _define_Vi 10
+ _define_Vi 11
+ _define_Vi 12
+ _define_Vi 13
+ _define_Vi 14
+ _define_Vi 15
+.if USE_AVX10
+ _define_Vi 16
+ _define_Vi 17
+ _define_Vi 18
+ _define_Vi 19
+ _define_Vi 20
+ _define_Vi 21
+ _define_Vi 22
+ _define_Vi 23
+ _define_Vi 24
+ _define_Vi 25
+ _define_Vi 26
+ _define_Vi 27
+ _define_Vi 28
+ _define_Vi 29
+ _define_Vi 30
+ _define_Vi 31
+.endif
+
+ // V0-V3 hold the data blocks during the main loop, or temporary values
+ // otherwise. V4-V5 hold temporary values.
+
+ // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
+ .set TWEAK0_XMM, %xmm6
+ .set TWEAK0, V6
+ .set TWEAK1_XMM, %xmm7
+ .set TWEAK1, V7
+ .set TWEAK2, V8
+ .set TWEAK3, V9
+
+ // V10-V13 are used for computing the next values of TWEAK[0-3].
+ .set NEXT_TWEAK0, V10
+ .set NEXT_TWEAK1, V11
+ .set NEXT_TWEAK2, V12
+ .set NEXT_TWEAK3, V13
+
+ // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
+ .set GF_POLY_XMM, %xmm14
+ .set GF_POLY, V14
+
+ // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
+ .set KEY0_XMM, %xmm15
+ .set KEY0, V15
+
+ // If 32 SIMD registers are available, then V16-V29 hold the remaining
+ // AES round keys, copied to all 128-bit lanes.
+ //
+ // AES-128, AES-192, and AES-256 use different numbers of round keys.
+ // To allow handling all three variants efficiently, we align the round
+ // keys to the *end* of this register range. I.e., AES-128 uses
+ // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
+ // (All also use KEY0 for the XOR-only "round" at the beginning.)
+.if USE_AVX10
+ .set KEY1_XMM, %xmm16
+ .set KEY1, V16
+ .set KEY2_XMM, %xmm17
+ .set KEY2, V17
+ .set KEY3_XMM, %xmm18
+ .set KEY3, V18
+ .set KEY4_XMM, %xmm19
+ .set KEY4, V19
+ .set KEY5_XMM, %xmm20
+ .set KEY5, V20
+ .set KEY6_XMM, %xmm21
+ .set KEY6, V21
+ .set KEY7_XMM, %xmm22
+ .set KEY7, V22
+ .set KEY8_XMM, %xmm23
+ .set KEY8, V23
+ .set KEY9_XMM, %xmm24
+ .set KEY9, V24
+ .set KEY10_XMM, %xmm25
+ .set KEY10, V25
+ .set KEY11_XMM, %xmm26
+ .set KEY11, V26
+ .set KEY12_XMM, %xmm27
+ .set KEY12, V27
+ .set KEY13_XMM, %xmm28
+ .set KEY13, V28
+ .set KEY14_XMM, %xmm29
+ .set KEY14, V29
+.endif
+ // V30-V31 are currently unused.
+.endm
+
+// Move a vector between memory and a register.
+.macro _vmovdqu src, dst
+.if VL < 64
+ vmovdqu \src, \dst
+.else
+ vmovdqu8 \src, \dst
+.endif
+.endm
+
+// Broadcast a 128-bit value into a vector.
+.macro _vbroadcast128 src, dst
+.if VL == 16 && !USE_AVX10
+ vmovdqu \src, \dst
+.elseif VL == 32 && !USE_AVX10
+ vbroadcasti128 \src, \dst
+.else
+ vbroadcasti32x4 \src, \dst
+.endif
+.endm
+
+// XOR two vectors together.
+.macro _vpxor src1, src2, dst
+.if USE_AVX10
+ vpxord \src1, \src2, \dst
+.else
+ vpxor \src1, \src2, \dst
+.endif
+.endm
+
+// XOR three vectors together.
+.macro _xor3 src1, src2, src3_and_dst
+.if USE_AVX10
+ // vpternlogd with immediate 0x96 is a three-argument XOR.
+ vpternlogd $0x96, \src1, \src2, \src3_and_dst
+.else
+ vpxor \src1, \src3_and_dst, \src3_and_dst
+ vpxor \src2, \src3_and_dst, \src3_and_dst
+.endif
+.endm
+
+// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
+// (by multiplying by the polynomial 'x') and write it to \dst.
+.macro _next_tweak src, tmp, dst
+ vpshufd $0x13, \src, \tmp
+ vpaddq \src, \src, \dst
+ vpsrad $31, \tmp, \tmp
+ vpand GF_POLY_XMM, \tmp, \tmp
+ vpxor \tmp, \dst, \dst
+.endm
+
+// Given the XTS tweak(s) in the vector \src, compute the next vector of
+// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
+//
+// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
+// all tweaks in the vector in parallel. If VL=16, we just do the regular
+// computation without vpclmulqdq, as it's the faster method for a single tweak.
+.macro _next_tweakvec src, tmp1, tmp2, dst
+.if VL == 16
+ _next_tweak \src, \tmp1, \dst
+.else
+ vpsrlq $64 - VL/16, \src, \tmp1
+ vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2
+ vpslldq $8, \tmp1, \tmp1
+ vpsllq $VL/16, \src, \dst
+ _xor3 \tmp1, \tmp2, \dst
+.endif
+.endm
+
+// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
+// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
+.macro _compute_first_set_of_tweaks
+ vmovdqu (TWEAK), TWEAK0_XMM
+ _vbroadcast128 .Lgf_poly(%rip), GF_POLY
+.if VL == 16
+ // With VL=16, multiplying by x serially is fastest.
+ _next_tweak TWEAK0, %xmm0, TWEAK1
+ _next_tweak TWEAK1, %xmm0, TWEAK2
+ _next_tweak TWEAK2, %xmm0, TWEAK3
+.else
+.if VL == 32
+ // Compute the second block of TWEAK0.
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1
+ vinserti128 $1, %xmm1, TWEAK0, TWEAK0
+.elseif VL == 64
+ // Compute the remaining blocks of TWEAK0.
+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1
+ _next_tweak %xmm1, %xmm0, %xmm2
+ _next_tweak %xmm2, %xmm0, %xmm3
+ vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0
+ vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0
+ vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0
+.endif
+ // Compute TWEAK[1-3] from TWEAK0.
+ vpsrlq $64 - 1*VL/16, TWEAK0, V0
+ vpsrlq $64 - 2*VL/16, TWEAK0, V2
+ vpsrlq $64 - 3*VL/16, TWEAK0, V4
+ vpclmulqdq $0x01, GF_POLY, V0, V1
+ vpclmulqdq $0x01, GF_POLY, V2, V3
+ vpclmulqdq $0x01, GF_POLY, V4, V5
+ vpslldq $8, V0, V0
+ vpslldq $8, V2, V2
+ vpslldq $8, V4, V4
+ vpsllq $1*VL/16, TWEAK0, TWEAK1
+ vpsllq $2*VL/16, TWEAK0, TWEAK2
+ vpsllq $3*VL/16, TWEAK0, TWEAK3
+.if USE_AVX10
+ vpternlogd $0x96, V0, V1, TWEAK1
+ vpternlogd $0x96, V2, V3, TWEAK2
+ vpternlogd $0x96, V4, V5, TWEAK3
+.else
+ vpxor V0, TWEAK1, TWEAK1
+ vpxor V2, TWEAK2, TWEAK2
+ vpxor V4, TWEAK3, TWEAK3
+ vpxor V1, TWEAK1, TWEAK1
+ vpxor V3, TWEAK2, TWEAK2
+ vpxor V5, TWEAK3, TWEAK3
+.endif
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the method of just
+// multiplying by x repeatedly (the same method _next_tweak uses).
+.macro _tweak_step_mulx i
+.if \i == 0
+ .set PREV_TWEAK, TWEAK3
+ .set NEXT_TWEAK, NEXT_TWEAK0
+.elseif \i == 5
+ .set PREV_TWEAK, NEXT_TWEAK0
+ .set NEXT_TWEAK, NEXT_TWEAK1
+.elseif \i == 10
+ .set PREV_TWEAK, NEXT_TWEAK1
+ .set NEXT_TWEAK, NEXT_TWEAK2
+.elseif \i == 15
+ .set PREV_TWEAK, NEXT_TWEAK2
+ .set NEXT_TWEAK, NEXT_TWEAK3
+.endif
+.if \i >= 0 && \i < 20 && \i % 5 == 0
+ vpshufd $0x13, PREV_TWEAK, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 1
+ vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
+.elseif \i >= 0 && \i < 20 && \i % 5 == 2
+ vpsrad $31, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 3
+ vpand GF_POLY, V5, V5
+.elseif \i >= 0 && \i < 20 && \i % 5 == 4
+ vpxor V5, NEXT_TWEAK, NEXT_TWEAK
+.elseif \i == 1000
+ vmovdqa NEXT_TWEAK0, TWEAK0
+ vmovdqa NEXT_TWEAK1, TWEAK1
+ vmovdqa NEXT_TWEAK2, TWEAK2
+ vmovdqa NEXT_TWEAK3, TWEAK3
+.endif
+.endm
+
+// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
+// (the same method _next_tweakvec uses for VL > 16). This means multiplying
+// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8
+// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
+// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
+.macro _tweak_step_pclmul i
+.if \i == 0
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
+.elseif \i == 2
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
+.elseif \i == 4
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
+.elseif \i == 6
+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
+.elseif \i == 8
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
+.elseif \i == 10
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
+.elseif \i == 12
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
+.elseif \i == 14
+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
+.elseif \i == 1000
+ vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0
+ vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1
+ vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2
+ vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3
+ _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0
+ _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1
+ _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2
+ _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3
+.endif
+.endm
+
+// _tweak_step does one step of the computation of the next set of tweaks from
+// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
+// \i that include at least 0 through 19, then 1000 which signals the last step.
+//
+// This is used to interleave the computation of the next set of tweaks with the
+// AES en/decryptions, which increases performance in some cases.
+.macro _tweak_step i
+.if VL == 16
+ _tweak_step_mulx \i
+.else
+ _tweak_step_pclmul \i
+.endif
+.endm
+
+.macro _setup_round_keys enc
+
+ // Select either the encryption round keys or the decryption round keys.
+.if \enc
+ .set OFFS, 0
+.else
+ .set OFFS, 240
+.endif
+
+ // Load the round key for "round 0".
+ _vbroadcast128 OFFS(KEY), KEY0
+
+ // Increment KEY to make it so that 7*16(KEY) is the last round key.
+ // For AES-128, increment by 3*16, resulting in the 10 round keys (not
+ // counting the zero-th round key which was just loaded into KEY0) being
+ // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
+ // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
+ // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
+ //
+ // This rebasing provides two benefits. First, it makes the offset to
+ // any round key be in the range [-96, 112], fitting in a signed byte.
+ // This shortens VEX-encoded instructions that access the later round
+ // keys which otherwise would need 4-byte offsets. Second, it makes it
+ // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
+ // beginning. Skipping rounds at the end doesn't work as well because
+ // the last round needs different instructions.
+ //
+ // An alternative approach would be to roll up all the round loops. We
+ // don't do that because it isn't compatible with caching the round keys
+ // in registers which we do when possible (see below), and also because
+ // it seems unwise to rely *too* heavily on the CPU's branch predictor.
+ lea OFFS-16(KEY, KEYLEN64, 4), KEY
+
+ // If all 32 SIMD registers are available, cache all the round keys.
+.if USE_AVX10
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ _vbroadcast128 -6*16(KEY), KEY1
+ _vbroadcast128 -5*16(KEY), KEY2
+.Laes192\@:
+ _vbroadcast128 -4*16(KEY), KEY3
+ _vbroadcast128 -3*16(KEY), KEY4
+.Laes128\@:
+ _vbroadcast128 -2*16(KEY), KEY5
+ _vbroadcast128 -1*16(KEY), KEY6
+ _vbroadcast128 0*16(KEY), KEY7
+ _vbroadcast128 1*16(KEY), KEY8
+ _vbroadcast128 2*16(KEY), KEY9
+ _vbroadcast128 3*16(KEY), KEY10
+ _vbroadcast128 4*16(KEY), KEY11
+ _vbroadcast128 5*16(KEY), KEY12
+ _vbroadcast128 6*16(KEY), KEY13
+ _vbroadcast128 7*16(KEY), KEY14
+.endif
+.endm
+
+// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
+// on the block(s) in \data using the round key(s) in \key. The register length
+// determines the number of AES blocks en/decrypted.
+.macro _vaes enc, last, key, data
+.if \enc
+.if \last
+ vaesenclast \key, \data, \data
+.else
+ vaesenc \key, \data, \data
+.endif
+.else
+.if \last
+ vaesdeclast \key, \data, \data
+.else
+ vaesdec \key, \data, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the block(s) in \data, using the
+// same key for all block(s). The round key is loaded from the appropriate
+// register or memory location for round \i. May clobber V4.
+.macro _vaes_1x enc, last, i, xmm_suffix, data
+.if USE_AVX10
+ _vaes \enc, \last, KEY\i\xmm_suffix, \data
+.else
+.ifnb \xmm_suffix
+ _vaes \enc, \last, (\i-7)*16(KEY), \data
+.else
+ _vbroadcast128 (\i-7)*16(KEY), V4
+ _vaes \enc, \last, V4, \data
+.endif
+.endif
+.endm
+
+// Do a single round of AES en/decryption on the blocks in registers V0-V3,
+// using the same key for all blocks. The round key is loaded from the
+// appropriate register or memory location for round \i. In addition, does two
+// steps of the computation of the next set of tweaks. May clobber V4.
+.macro _vaes_4x enc, last, i
+.if USE_AVX10
+ _tweak_step (2*(\i-5))
+ _vaes \enc, \last, KEY\i, V0
+ _vaes \enc, \last, KEY\i, V1
+ _tweak_step (2*(\i-5) + 1)
+ _vaes \enc, \last, KEY\i, V2
+ _vaes \enc, \last, KEY\i, V3
+.else
+ _vbroadcast128 (\i-7)*16(KEY), V4
+ _tweak_step (2*(\i-5))
+ _vaes \enc, \last, V4, V0
+ _vaes \enc, \last, V4, V1
+ _tweak_step (2*(\i-5) + 1)
+ _vaes \enc, \last, V4, V2
+ _vaes \enc, \last, V4, V3
+.endif
+.endm
+
+// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
+// then XOR with \tweak again) of the block(s) in \data. To process a single
+// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
+// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
+.macro _aes_crypt enc, xmm_suffix, tweak, data
+ _xor3 KEY0\xmm_suffix, \tweak, \data
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ _vaes_1x \enc, 0, 1, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 2, \xmm_suffix, \data
+.Laes192\@:
+ _vaes_1x \enc, 0, 3, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 4, \xmm_suffix, \data
+.Laes128\@:
+ _vaes_1x \enc, 0, 5, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 6, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 7, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 8, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 9, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 10, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 11, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 12, \xmm_suffix, \data
+ _vaes_1x \enc, 0, 13, \xmm_suffix, \data
+ _vaes_1x \enc, 1, 14, \xmm_suffix, \data
+ _vpxor \tweak, \data, \data
+.endm
+
+.macro _aes_xts_crypt enc
+ _define_aliases
+
+.if !\enc
+ // When decrypting a message whose length isn't a multiple of the AES
+ // block length, exclude the last full block from the main loop by
+ // subtracting 16 from LEN. This is needed because ciphertext stealing
+ // decryption uses the last two tweaks in reverse order. We'll handle
+ // the last full block and the partial block specially at the end.
+ lea -16(LEN), %eax
+ test $15, LEN8
+ cmovnz %eax, LEN
+.endif
+
+ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
+ movl 480(KEY), KEYLEN
+
+ // Setup the pointer to the round keys and cache as many as possible.
+ _setup_round_keys \enc
+
+ // Compute the first set of tweaks TWEAK[0-3].
+ _compute_first_set_of_tweaks
+
+ sub $4*VL, LEN
+ jl .Lhandle_remainder\@
+
+.Lmain_loop\@:
+ // This is the main loop, en/decrypting 4*VL bytes per iteration.
+
+ // XOR each source block with its tweak and the zero-th round key.
+.if USE_AVX10
+ vmovdqu8 0*VL(SRC), V0
+ vmovdqu8 1*VL(SRC), V1
+ vmovdqu8 2*VL(SRC), V2
+ vmovdqu8 3*VL(SRC), V3
+ vpternlogd $0x96, TWEAK0, KEY0, V0
+ vpternlogd $0x96, TWEAK1, KEY0, V1
+ vpternlogd $0x96, TWEAK2, KEY0, V2
+ vpternlogd $0x96, TWEAK3, KEY0, V3
+.else
+ vpxor 0*VL(SRC), KEY0, V0
+ vpxor 1*VL(SRC), KEY0, V1
+ vpxor 2*VL(SRC), KEY0, V2
+ vpxor 3*VL(SRC), KEY0, V3
+ vpxor TWEAK0, V0, V0
+ vpxor TWEAK1, V1, V1
+ vpxor TWEAK2, V2, V2
+ vpxor TWEAK3, V3, V3
+.endif
+ cmp $24, KEYLEN
+ jl .Laes128\@
+ je .Laes192\@
+ // Do all the AES rounds on the data blocks, interleaved with
+ // the computation of the next set of tweaks.
+ _vaes_4x \enc, 0, 1
+ _vaes_4x \enc, 0, 2
+.Laes192\@:
+ _vaes_4x \enc, 0, 3
+ _vaes_4x \enc, 0, 4
+.Laes128\@:
+ _vaes_4x \enc, 0, 5
+ _vaes_4x \enc, 0, 6
+ _vaes_4x \enc, 0, 7
+ _vaes_4x \enc, 0, 8
+ _vaes_4x \enc, 0, 9
+ _vaes_4x \enc, 0, 10
+ _vaes_4x \enc, 0, 11
+ _vaes_4x \enc, 0, 12
+ _vaes_4x \enc, 0, 13
+ _vaes_4x \enc, 1, 14
+
+ // XOR in the tweaks again.
+ _vpxor TWEAK0, V0, V0
+ _vpxor TWEAK1, V1, V1
+ _vpxor TWEAK2, V2, V2
+ _vpxor TWEAK3, V3, V3
+
+ // Store the destination blocks.
+ _vmovdqu V0, 0*VL(DST)
+ _vmovdqu V1, 1*VL(DST)
+ _vmovdqu V2, 2*VL(DST)
+ _vmovdqu V3, 3*VL(DST)
+
+ // Finish computing the next set of tweaks.
+ _tweak_step 1000
+
+ add $4*VL, SRC
+ add $4*VL, DST
+ sub $4*VL, LEN
+ jge .Lmain_loop\@
+
+ // Check for the uncommon case where the data length isn't a multiple of
+ // 4*VL. Handle it out-of-line in order to optimize for the common
+ // case. In the common case, just fall through to the ret.
+ test $4*VL-1, LEN8
+ jnz .Lhandle_remainder\@
+.Ldone\@:
+ // Store the next tweak back to *TWEAK to support continuation calls.
+ vmovdqu TWEAK0_XMM, (TWEAK)
+.if VL > 16
+ vzeroupper
+.endif
+ RET
+
+.Lhandle_remainder\@:
+
+ // En/decrypt any remaining full blocks, one vector at a time.
+.if VL > 16
+ add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
+ jl .Lvec_at_a_time_done\@
+.Lvec_at_a_time\@:
+ _vmovdqu (SRC), V0
+ _aes_crypt \enc, , TWEAK0, V0
+ _vmovdqu V0, (DST)
+ _next_tweakvec TWEAK0, V0, V1, TWEAK0
+ add $VL, SRC
+ add $VL, DST
+ sub $VL, LEN
+ jge .Lvec_at_a_time\@
+.Lvec_at_a_time_done\@:
+ add $VL-16, LEN // Undo extra sub of VL, then sub 16.
+.else
+ add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.
+.endif
+
+ // En/decrypt any remaining full blocks, one at a time.
+ jl .Lblock_at_a_time_done\@
+.Lblock_at_a_time\@:
+ vmovdqu (SRC), %xmm0
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
+ vmovdqu %xmm0, (DST)
+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM
+ add $16, SRC
+ add $16, DST
+ sub $16, LEN
+ jge .Lblock_at_a_time\@
+.Lblock_at_a_time_done\@:
+ add $16, LEN // Undo the extra sub of 16.
+ // Now 0 <= LEN <= 15. If LEN is zero, we're done.
+ jz .Ldone\@
+
+ // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
+ // Do ciphertext stealing to process the last 16 + LEN bytes.
+
+.if \enc
+ // If encrypting, the main loop already encrypted the last full block to
+ // create the CTS intermediate ciphertext. Prepare for the rest of CTS
+ // by rewinding the pointers and loading the intermediate ciphertext.
+ sub $16, SRC
+ sub $16, DST
+ vmovdqu (DST), %xmm0
+.else
+ // If decrypting, the main loop didn't decrypt the last full block
+ // because CTS decryption uses the last two tweaks in reverse order.
+ // Do it now by advancing the tweak and decrypting the last full block.
+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM
+ vmovdqu (SRC), %xmm0
+ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0
+.endif
+
+.if USE_AVX10
+ // Create a mask that has the first LEN bits set.
+ mov $-1, %r9d
+ bzhi LEN, %r9d, %r9d
+ kmovd %r9d, %k1
+
+ // Swap the first LEN bytes of the en/decryption of the last full block
+ // with the partial block. Note that to support in-place en/decryption,
+ // the load from the src partial block must happen before the store to
+ // the dst partial block.
+ vmovdqa %xmm0, %xmm1
+ vmovdqu8 16(SRC), %xmm0{%k1}
+ vmovdqu8 %xmm1, 16(DST){%k1}
+.else
+ lea .Lcts_permute_table(%rip), %r9
+
+ // Load the src partial block, left-aligned. Note that to support
+ // in-place en/decryption, this must happen before the store to the dst
+ // partial block.
+ vmovdqu (SRC, LEN64, 1), %xmm1
+
+ // Shift the first LEN bytes of the en/decryption of the last full block
+ // to the end of a register, then store it to DST+LEN. This stores the
+ // dst partial block. It also writes to the second part of the dst last
+ // full block, but that part is overwritten later.
+ vpshufb (%r9, LEN64, 1), %xmm0, %xmm2
+ vmovdqu %xmm2, (DST, LEN64, 1)
+
+ // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
+ sub LEN64, %r9
+ vmovdqu 32(%r9), %xmm3
+
+ // Shift the src partial block to the beginning of its register.
+ vpshufb %xmm3, %xmm1, %xmm1
+
+ // Do a blend to generate the src partial block followed by the second
+ // part of the en/decryption of the last full block.
+ vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
+.endif
+ // En/decrypt again and store the last full block.
+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
+ vmovdqu %xmm0, (DST)
+ jmp .Ldone\@
+.endm
+
+// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+// u8 iv[AES_BLOCK_SIZE]);
+SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
+ vmovdqu (%rsi), %xmm0
+ vpxor (%rdi), %xmm0, %xmm0
+ movl 480(%rdi), %eax // AES key length
+ lea -16(%rdi, %rax, 4), %rdi
+ cmp $24, %eax
+ jl .Lencrypt_iv_aes128
+ je .Lencrypt_iv_aes192
+ vaesenc -6*16(%rdi), %xmm0, %xmm0
+ vaesenc -5*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes192:
+ vaesenc -4*16(%rdi), %xmm0, %xmm0
+ vaesenc -3*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes128:
+ vaesenc -2*16(%rdi), %xmm0, %xmm0
+ vaesenc -1*16(%rdi), %xmm0, %xmm0
+ vaesenc 0*16(%rdi), %xmm0, %xmm0
+ vaesenc 1*16(%rdi), %xmm0, %xmm0
+ vaesenc 2*16(%rdi), %xmm0, %xmm0
+ vaesenc 3*16(%rdi), %xmm0, %xmm0
+ vaesenc 4*16(%rdi), %xmm0, %xmm0
+ vaesenc 5*16(%rdi), %xmm0, %xmm0
+ vaesenc 6*16(%rdi), %xmm0, %xmm0
+ vaesenclast 7*16(%rdi), %xmm0, %xmm0
+ vmovdqu %xmm0, (%rsi)
+ RET
+SYM_FUNC_END(aes_xts_encrypt_iv)
+
+// Below are the actual AES-XTS encryption and decryption functions,
+// instantiated from the above macro. They all have the following prototype:
+//
+// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
+// const u8 *src, u8 *dst, unsigned int len,
+// u8 tweak[AES_BLOCK_SIZE]);
+//
+// |key| is the data key. |tweak| contains the next tweak; the encryption of
+// the original IV with the tweak key was already done. This function supports
+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
+// |len| must be a multiple of 16 except on the last call. If |len| is a
+// multiple of 16, then this function updates |tweak| to contain the next tweak.
+
+.set VL, 16
+.set USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
+
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+.set VL, 32
+.set USE_AVX10, 0
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
+
+.set VL, 32
+.set USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
+
+.set VL, 64
+.set USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
+ _aes_xts_crypt 1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
+ _aes_xts_crypt 0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 411d8c83e88a..39066b57a70e 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -83,9 +83,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.text
-
-#define STACK_OFFSET 8*3
-
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
@@ -116,11 +113,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define arg4 rcx
#define arg5 r8
#define arg6 r9
-#define arg7 STACK_OFFSET+8(%rsp)
-#define arg8 STACK_OFFSET+16(%rsp)
-#define arg9 STACK_OFFSET+24(%rsp)
-#define arg10 STACK_OFFSET+32(%rsp)
-#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif
@@ -1507,184 +1499,6 @@ _esb_loop_\@:
MOVADQ (%r10),\TMP1
aesenclast \TMP1,\XMM0
.endm
-/*****************************************************************************
-* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data
-* // Context data
-* u8 *out, // Plaintext output. Encrypt in-place is allowed.
-* const u8 *in, // Ciphertext input
-* u64 plaintext_len, // Length of data in bytes for decryption.
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
-* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
-* // given authentication tag and only return the plaintext if they match.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
-* // (most likely), 12 or 8.
-*
-* Assumptions:
-*
-* keys:
-* keys are pre-expanded and aligned to 16 bytes. we are using the first
-* set of 11 keys in the data structure void *aes_ctx
-*
-* iv:
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Salt (From the SA) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Initialization Vector |
-* | (This is the sequence number from IPSec header) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x1 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-*
-*
-* AAD:
-* AAD padded to 128 bits with 0
-* for example, assume AAD is a u32 vector
-*
-* if AAD is 8 bytes:
-* AAD[3] = {A0, A1};
-* padded AAD in xmm register = {A1 A0 0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A1) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 32-bit Sequence Number (A0) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 32-bit Sequence Number
-*
-* if AAD is 12 bytes:
-* AAD[3] = {A0, A1, A2};
-* padded AAD in xmm register = {A2 A1 A0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A2) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 64-bit Extended Sequence Number {A1,A0} |
-* | |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 64-bit Extended Sequence Number
-*
-* poly = x^128 + x^127 + x^126 + x^121 + 1
-*
-*****************************************************************************/
-SYM_FUNC_START(aesni_gcm_dec)
- FUNC_SAVE
-
- GCM_INIT %arg6, arg7, arg8, arg9
- GCM_ENC_DEC dec
- GCM_COMPLETE arg10, arg11
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_dec)
-
-
-/*****************************************************************************
-* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
-* struct gcm_context_data *data
-* // Context data
-* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
-* const u8 *in, // Plaintext input
-* u64 plaintext_len, // Length of data in bytes for encryption.
-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-* // concatenated with 0x00000001. 16-byte aligned pointer.
-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
-* const u8 *aad, // Additional Authentication Data (AAD)
-* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
-* u8 *auth_tag, // Authenticated Tag output.
-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
-* // 12 or 8.
-*
-* Assumptions:
-*
-* keys:
-* keys are pre-expanded and aligned to 16 bytes. we are using the
-* first set of 11 keys in the data structure void *aes_ctx
-*
-*
-* iv:
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Salt (From the SA) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | Initialization Vector |
-* | (This is the sequence number from IPSec header) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x1 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-*
-*
-* AAD:
-* AAD padded to 128 bits with 0
-* for example, assume AAD is a u32 vector
-*
-* if AAD is 8 bytes:
-* AAD[3] = {A0, A1};
-* padded AAD in xmm register = {A1 A0 0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A1) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 32-bit Sequence Number (A0) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 32-bit Sequence Number
-*
-* if AAD is 12 bytes:
-* AAD[3] = {A0, A1, A2};
-* padded AAD in xmm register = {A2 A1 A0 0}
-*
-* 0 1 2 3
-* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | SPI (A2) |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 64-bit Extended Sequence Number {A1,A0} |
-* | |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-* | 0x0 |
-* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-*
-* AAD Format with 64-bit Extended Sequence Number
-*
-* poly = x^128 + x^127 + x^126 + x^121 + 1
-***************************************************************************/
-SYM_FUNC_START(aesni_gcm_enc)
- FUNC_SAVE
-
- GCM_INIT %arg6, arg7, arg8, arg9
- GCM_ENC_DEC enc
-
- GCM_COMPLETE arg10, arg11
- FUNC_RESTORE
- RET
-SYM_FUNC_END(aesni_gcm_enc)
/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
@@ -1820,8 +1634,8 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b)
SYM_FUNC_END(_key_expansion_256b)
/*
- * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- * unsigned int key_len)
+ * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ * unsigned int key_len)
*/
SYM_FUNC_START(aesni_set_key)
FRAME_BEGIN
@@ -1926,7 +1740,6 @@ SYM_FUNC_START(aesni_set_key)
sub $0x10, UKEYP
cmp TKEYP, KEYP
jb .Ldec_key_loop
- xor AREG, AREG
#ifndef __x86_64__
popl KEYP
#endif
@@ -2826,28 +2639,24 @@ SYM_FUNC_END(aesni_ctr_enc)
.previous
/*
- * _aesni_gf128mul_x_ble: internal ABI
- * Multiply in GF(2^128) for XTS IVs
+ * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
* input:
* IV: current IV
* GF128MUL_MASK == mask with 0x87 and 0x01
* output:
* IV: next IV
* changed:
- * CTR: == temporary value
+ * KEY: == temporary value
*/
-#define _aesni_gf128mul_x_ble() \
- pshufd $0x13, IV, KEY; \
- paddq IV, IV; \
- psrad $31, KEY; \
- pand GF128MUL_MASK, KEY; \
- pxor KEY, IV;
+.macro _aesni_gf128mul_x_ble
+ pshufd $0x13, IV, KEY
+ paddq IV, IV
+ psrad $31, KEY
+ pand GF128MUL_MASK, KEY
+ pxor KEY, IV
+.endm
-/*
- * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, unsigned int len, le128 *iv)
- */
-SYM_FUNC_START(aesni_xts_encrypt)
+.macro _aesni_xts_crypt enc
FRAME_BEGIN
#ifndef __x86_64__
pushl IVP
@@ -2866,35 +2675,46 @@ SYM_FUNC_START(aesni_xts_encrypt)
movups (IVP), IV
mov 480(KEYP), KLEN
+.if !\enc
+ add $240, KEYP
-.Lxts_enc_loop4:
+ test $15, LEN
+ jz .Lxts_loop4\@
+ sub $16, LEN
+.endif
+
+.Lxts_loop4\@:
sub $64, LEN
- jl .Lxts_enc_1x
+ jl .Lxts_1x\@
movdqa IV, STATE1
movdqu 0x00(INP), IN
pxor IN, STATE1
movdqu IV, 0x00(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE2
movdqu 0x10(INP), IN
pxor IN, STATE2
movdqu IV, 0x10(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE3
movdqu 0x20(INP), IN
pxor IN, STATE3
movdqu IV, 0x20(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
movdqa IV, STATE4
movdqu 0x30(INP), IN
pxor IN, STATE4
movdqu IV, 0x30(OUTP)
+.if \enc
call _aesni_enc4
+.else
+ call _aesni_dec4
+.endif
movdqu 0x00(OUTP), IN
pxor IN, STATE1
@@ -2912,17 +2732,17 @@ SYM_FUNC_START(aesni_xts_encrypt)
pxor IN, STATE4
movdqu STATE4, 0x30(OUTP)
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
add $64, INP
add $64, OUTP
test LEN, LEN
- jnz .Lxts_enc_loop4
+ jnz .Lxts_loop4\@
-.Lxts_enc_ret_iv:
+.Lxts_ret_iv\@:
movups IV, (IVP)
-.Lxts_enc_ret:
+.Lxts_ret\@:
#ifndef __x86_64__
popl KLEN
popl KEYP
@@ -2932,201 +2752,60 @@ SYM_FUNC_START(aesni_xts_encrypt)
FRAME_END
RET
-.Lxts_enc_1x:
+.Lxts_1x\@:
add $64, LEN
- jz .Lxts_enc_ret_iv
+ jz .Lxts_ret_iv\@
+.if \enc
sub $16, LEN
- jl .Lxts_enc_cts4
+ jl .Lxts_cts4\@
+.endif
-.Lxts_enc_loop1:
+.Lxts_loop1\@:
movdqu (INP), STATE
+.if \enc
pxor IV, STATE
call _aesni_enc1
- pxor IV, STATE
- _aesni_gf128mul_x_ble()
-
- test LEN, LEN
- jz .Lxts_enc_out
-
+.else
add $16, INP
sub $16, LEN
- jl .Lxts_enc_cts1
-
- movdqu STATE, (OUTP)
- add $16, OUTP
- jmp .Lxts_enc_loop1
-
-.Lxts_enc_out:
- movdqu STATE, (OUTP)
- jmp .Lxts_enc_ret_iv
-
-.Lxts_enc_cts4:
- movdqa STATE4, STATE
- sub $16, OUTP
-
-.Lxts_enc_cts1:
-#ifndef __x86_64__
- lea .Lcts_permute_table, T1
-#else
- lea .Lcts_permute_table(%rip), T1
-#endif
- add LEN, INP /* rewind input pointer */
- add $16, LEN /* # bytes in final block */
- movups (INP), IN1
-
- mov T1, IVP
- add $32, IVP
- add LEN, T1
- sub LEN, IVP
- add OUTP, LEN
-
- movups (T1), %xmm4
- movaps STATE, IN2
- pshufb %xmm4, STATE
- movups STATE, (LEN)
-
- movups (IVP), %xmm0
- pshufb %xmm0, IN1
- pblendvb IN2, IN1
- movaps IN1, STATE
-
+ jl .Lxts_cts1\@
pxor IV, STATE
- call _aesni_enc1
+ call _aesni_dec1
+.endif
pxor IV, STATE
+ _aesni_gf128mul_x_ble
- movups STATE, (OUTP)
- jmp .Lxts_enc_ret
-SYM_FUNC_END(aesni_xts_encrypt)
-
-/*
- * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, unsigned int len, le128 *iv)
- */
-SYM_FUNC_START(aesni_xts_decrypt)
- FRAME_BEGIN
-#ifndef __x86_64__
- pushl IVP
- pushl LEN
- pushl KEYP
- pushl KLEN
- movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
- movl (FRAME_OFFSET+24)(%esp), OUTP # dst
- movl (FRAME_OFFSET+28)(%esp), INP # src
- movl (FRAME_OFFSET+32)(%esp), LEN # len
- movl (FRAME_OFFSET+36)(%esp), IVP # iv
- movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
-#else
- movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
-#endif
- movups (IVP), IV
-
- mov 480(KEYP), KLEN
- add $240, KEYP
-
- test $15, LEN
- jz .Lxts_dec_loop4
- sub $16, LEN
-
-.Lxts_dec_loop4:
- sub $64, LEN
- jl .Lxts_dec_1x
-
- movdqa IV, STATE1
- movdqu 0x00(INP), IN
- pxor IN, STATE1
- movdqu IV, 0x00(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x10(INP), IN
- pxor IN, STATE2
- movdqu IV, 0x10(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x20(INP), IN
- pxor IN, STATE3
- movdqu IV, 0x20(OUTP)
-
- _aesni_gf128mul_x_ble()
- movdqa IV, STATE4
- movdqu 0x30(INP), IN
- pxor IN, STATE4
- movdqu IV, 0x30(OUTP)
-
- call _aesni_dec4
-
- movdqu 0x00(OUTP), IN
- pxor IN, STATE1
- movdqu STATE1, 0x00(OUTP)
-
- movdqu 0x10(OUTP), IN
- pxor IN, STATE2
- movdqu STATE2, 0x10(OUTP)
-
- movdqu 0x20(OUTP), IN
- pxor IN, STATE3
- movdqu STATE3, 0x20(OUTP)
-
- movdqu 0x30(OUTP), IN
- pxor IN, STATE4
- movdqu STATE4, 0x30(OUTP)
-
- _aesni_gf128mul_x_ble()
-
- add $64, INP
- add $64, OUTP
test LEN, LEN
- jnz .Lxts_dec_loop4
-
-.Lxts_dec_ret_iv:
- movups IV, (IVP)
-
-.Lxts_dec_ret:
-#ifndef __x86_64__
- popl KLEN
- popl KEYP
- popl LEN
- popl IVP
-#endif
- FRAME_END
- RET
-
-.Lxts_dec_1x:
- add $64, LEN
- jz .Lxts_dec_ret_iv
-
-.Lxts_dec_loop1:
- movdqu (INP), STATE
+ jz .Lxts_out\@
+.if \enc
add $16, INP
sub $16, LEN
- jl .Lxts_dec_cts1
-
- pxor IV, STATE
- call _aesni_dec1
- pxor IV, STATE
- _aesni_gf128mul_x_ble()
-
- test LEN, LEN
- jz .Lxts_dec_out
+ jl .Lxts_cts1\@
+.endif
movdqu STATE, (OUTP)
add $16, OUTP
- jmp .Lxts_dec_loop1
+ jmp .Lxts_loop1\@
-.Lxts_dec_out:
+.Lxts_out\@:
movdqu STATE, (OUTP)
- jmp .Lxts_dec_ret_iv
+ jmp .Lxts_ret_iv\@
-.Lxts_dec_cts1:
+.if \enc
+.Lxts_cts4\@:
+ movdqa STATE4, STATE
+ sub $16, OUTP
+.Lxts_cts1\@:
+.else
+.Lxts_cts1\@:
movdqa IV, STATE4
- _aesni_gf128mul_x_ble()
+ _aesni_gf128mul_x_ble
pxor IV, STATE
call _aesni_dec1
pxor IV, STATE
-
+.endif
#ifndef __x86_64__
lea .Lcts_permute_table, T1
#else
@@ -3152,10 +2831,32 @@ SYM_FUNC_START(aesni_xts_decrypt)
pblendvb IN2, IN1
movaps IN1, STATE
+.if \enc
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
+.else
pxor STATE4, STATE
call _aesni_dec1
pxor STATE4, STATE
+.endif
movups STATE, (OUTP)
- jmp .Lxts_dec_ret
-SYM_FUNC_END(aesni_xts_decrypt)
+ jmp .Lxts_ret\@
+.endm
+
+/*
+ * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_enc)
+ _aesni_xts_crypt 1
+SYM_FUNC_END(aesni_xts_enc)
+
+/*
+ * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_dec)
+ _aesni_xts_crypt 0
+SYM_FUNC_END(aesni_xts_dec)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index b1d90c25975a..5b25d2a58aeb 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -40,7 +40,6 @@
#define AESNI_ALIGN 16
#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
-#define RFC4106_HASH_SUBKEY_SIZE 16
#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
@@ -87,8 +86,8 @@ static inline void *aes_align_addr(void *addr)
return PTR_ALIGN(addr, AESNI_ALIGN);
}
-asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
- unsigned int key_len);
+asmlinkage void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+ unsigned int key_len);
asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
@@ -107,11 +106,11 @@ asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
-asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
-asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
#ifdef CONFIG_X86_64
@@ -233,19 +232,17 @@ static int aes_set_key_common(struct crypto_aes_ctx *ctx,
{
int err;
- if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
- key_len != AES_KEYSIZE_256)
- return -EINVAL;
-
if (!crypto_simd_usable())
- err = aes_expandkey(ctx, in_key, key_len);
- else {
- kernel_fpu_begin();
- err = aesni_set_key(ctx, in_key, key_len);
- kernel_fpu_end();
- }
+ return aes_expandkey(ctx, in_key, key_len);
- return err;
+ err = aes_check_keylen(key_len);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_set_key(ctx, in_key, key_len);
+ kernel_fpu_end();
+ return 0;
}
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
@@ -592,23 +589,12 @@ static int xctr_crypt(struct skcipher_request *req)
return err;
}
-static int
-rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+static int aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key,
+ u8 hash_subkey[AES_BLOCK_SIZE])
{
- struct crypto_aes_ctx ctx;
- int ret;
-
- ret = aes_expandkey(&ctx, key, key_len);
- if (ret)
- return ret;
-
- /* Clear the data in the hash sub key container to zero.*/
- /* We want to cipher all zeros to create the hash sub key. */
- memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
-
- aes_encrypt(&ctx, hash_subkey, hash_subkey);
+ static const u8 zeroes[AES_BLOCK_SIZE];
- memzero_explicit(&ctx, sizeof(ctx));
+ aes_encrypt(aes_key, hash_subkey, zeroes);
return 0;
}
@@ -626,7 +612,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
- rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+ aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded,
+ ctx->hash_subkey);
}
/* This is the Integrity Check Value (aka the authentication tag) length and can
@@ -877,7 +864,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
}
#endif
-static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
+static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
@@ -898,108 +885,149 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
return aes_set_key_common(&ctx->tweak_ctx, key + keylen, keylen);
}
-static int xts_crypt(struct skcipher_request *req, bool encrypt)
+typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE]);
+typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE]);
+
+/* This handles cases where the source and/or destination span pages. */
+static noinline int
+xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct skcipher_walk walk;
+ struct scatterlist *src, *dst;
int err;
- if (req->cryptlen < AES_BLOCK_SIZE)
- return -EINVAL;
-
- err = skcipher_walk_virt(&walk, req, false);
- if (!walk.nbytes)
- return err;
-
- if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
- int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
-
- skcipher_walk_abort(&walk);
-
+ /*
+ * If the message length isn't divisible by the AES block size, then
+ * separate off the last full block and the partial block. This ensures
+ * that they are processed in the same call to the assembly function,
+ * which is required for ciphertext stealing.
+ */
+ if (tail) {
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
- blocks * AES_BLOCK_SIZE, req->iv);
+ req->cryptlen - tail - AES_BLOCK_SIZE,
+ req->iv);
req = &subreq;
+ }
- err = skcipher_walk_virt(&walk, req, false);
- if (!walk.nbytes)
- return err;
- } else {
- tail = 0;
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while (walk.nbytes) {
+ kernel_fpu_begin();
+ (*crypt_func)(&ctx->crypt_ctx,
+ walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes & ~(AES_BLOCK_SIZE - 1), req->iv);
+ kernel_fpu_end();
+ err = skcipher_walk_done(&walk,
+ walk.nbytes & (AES_BLOCK_SIZE - 1));
}
- kernel_fpu_begin();
+ if (err || !tail)
+ return err;
- /* calculate first value of T */
- aesni_enc(&ctx->tweak_ctx, walk.iv, walk.iv);
+ /* Do ciphertext stealing with the last full block and partial block. */
- while (walk.nbytes > 0) {
- int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes &= ~(AES_BLOCK_SIZE - 1);
-
- if (encrypt)
- aesni_xts_encrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, walk.iv);
- else
- aesni_xts_decrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, walk.iv);
- kernel_fpu_end();
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
- if (walk.nbytes > 0)
- kernel_fpu_begin();
- }
+ err = skcipher_walk_virt(&walk, req, false);
+ if (err)
+ return err;
- if (unlikely(tail > 0 && !err)) {
- struct scatterlist sg_src[2], sg_dst[2];
- struct scatterlist *src, *dst;
+ kernel_fpu_begin();
+ (*crypt_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr,
+ walk.nbytes, req->iv);
+ kernel_fpu_end();
- dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
- if (req->dst != req->src)
- dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+ return skcipher_walk_done(&walk, 0);
+}
- skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
- req->iv);
+/* __always_inline to avoid indirect call in fastpath */
+static __always_inline int
+xts_crypt(struct skcipher_request *req, xts_encrypt_iv_func encrypt_iv,
+ xts_crypt_func crypt_func)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
+ const unsigned int cryptlen = req->cryptlen;
+ struct scatterlist *src = req->src;
+ struct scatterlist *dst = req->dst;
- err = skcipher_walk_virt(&walk, &subreq, false);
- if (err)
- return err;
+ if (unlikely(cryptlen < AES_BLOCK_SIZE))
+ return -EINVAL;
- kernel_fpu_begin();
- if (encrypt)
- aesni_xts_encrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes, walk.iv);
- else
- aesni_xts_decrypt(&ctx->crypt_ctx,
- walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes, walk.iv);
- kernel_fpu_end();
+ kernel_fpu_begin();
+ (*encrypt_iv)(&ctx->tweak_ctx, req->iv);
- err = skcipher_walk_done(&walk, 0);
+ /*
+ * In practice, virtually all XTS plaintexts and ciphertexts are either
+ * 512 or 4096 bytes, aligned such that they don't span page boundaries.
+ * To optimize the performance of these cases, and also any other case
+ * where no page boundary is spanned, the below fast-path handles
+ * single-page sources and destinations as efficiently as possible.
+ */
+ if (likely(src->length >= cryptlen && dst->length >= cryptlen &&
+ src->offset + cryptlen <= PAGE_SIZE &&
+ dst->offset + cryptlen <= PAGE_SIZE)) {
+ struct page *src_page = sg_page(src);
+ struct page *dst_page = sg_page(dst);
+ void *src_virt = kmap_local_page(src_page) + src->offset;
+ void *dst_virt = kmap_local_page(dst_page) + dst->offset;
+
+ (*crypt_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen,
+ req->iv);
+ kunmap_local(dst_virt);
+ kunmap_local(src_virt);
+ kernel_fpu_end();
+ return 0;
}
- return err;
+ kernel_fpu_end();
+ return xts_crypt_slowpath(req, crypt_func);
+}
+
+static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE])
+{
+ aesni_enc(tweak_key, iv, iv);
+}
+
+static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE])
+{
+ aesni_xts_enc(key, dst, src, len, tweak);
}
-static int xts_encrypt(struct skcipher_request *req)
+static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
+ const u8 *src, u8 *dst, unsigned int len,
+ u8 tweak[AES_BLOCK_SIZE])
{
- return xts_crypt(req, true);
+ aesni_xts_dec(key, dst, src, len, tweak);
}
-static int xts_decrypt(struct skcipher_request *req)
+static int xts_encrypt_aesni(struct skcipher_request *req)
{
- return xts_crypt(req, false);
+ return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_encrypt);
+}
+
+static int xts_decrypt_aesni(struct skcipher_request *req)
+{
+ return xts_crypt(req, aesni_xts_encrypt_iv, aesni_xts_decrypt);
}
static struct crypto_alg aesni_cipher_alg = {
@@ -1103,9 +1131,9 @@ static struct skcipher_alg aesni_skciphers[] = {
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 2 * AES_BLOCK_SIZE,
- .setkey = xts_aesni_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
+ .setkey = xts_setkey_aesni,
+ .encrypt = xts_encrypt_aesni,
+ .decrypt = xts_decrypt_aesni,
}
};
@@ -1137,7 +1165,149 @@ static struct skcipher_alg aesni_xctr = {
};
static struct simd_skcipher_alg *aesni_simd_xctr;
-#endif /* CONFIG_X86_64 */
+
+asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
+ u8 iv[AES_BLOCK_SIZE]);
+
+#define DEFINE_XTS_ALG(suffix, driver_name, priority) \
+ \
+asmlinkage void \
+aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void \
+aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \
+ u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+ \
+static int xts_encrypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_encrypt_##suffix); \
+} \
+ \
+static int xts_decrypt_##suffix(struct skcipher_request *req) \
+{ \
+ return xts_crypt(req, aes_xts_encrypt_iv, aes_xts_decrypt_##suffix); \
+} \
+ \
+static struct skcipher_alg aes_xts_alg_##suffix = { \
+ .base = { \
+ .cra_name = "__xts(aes)", \
+ .cra_driver_name = "__" driver_name, \
+ .cra_priority = priority, \
+ .cra_flags = CRYPTO_ALG_INTERNAL, \
+ .cra_blocksize = AES_BLOCK_SIZE, \
+ .cra_ctxsize = XTS_AES_CTX_SIZE, \
+ .cra_module = THIS_MODULE, \
+ }, \
+ .min_keysize = 2 * AES_MIN_KEY_SIZE, \
+ .max_keysize = 2 * AES_MAX_KEY_SIZE, \
+ .ivsize = AES_BLOCK_SIZE, \
+ .walksize = 2 * AES_BLOCK_SIZE, \
+ .setkey = xts_setkey_aesni, \
+ .encrypt = xts_encrypt_##suffix, \
+ .decrypt = xts_decrypt_##suffix, \
+}; \
+ \
+static struct simd_skcipher_alg *aes_xts_simdalg_##suffix
+
+DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500);
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600);
+DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
+DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
+#endif
+
+/*
+ * This is a list of CPU models that are known to suffer from downclocking when
+ * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS
+ * implementation with zmm registers won't be used by default. An
+ * implementation with ymm registers (256-bit vectors) will be used instead.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_SKYLAKE_X },
+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_X },
+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_D },
+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE },
+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_L },
+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_NNPI },
+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE_L },
+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE },
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */
+ {},
+};
+
+static int __init register_xts_algs(void)
+{
+ int err;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX))
+ return 0;
+ err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1,
+ &aes_xts_simdalg_aesni_avx);
+ if (err)
+ return err;
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_VAES) ||
+ !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) ||
+ !boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ return 0;
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1,
+ &aes_xts_simdalg_vaes_avx2);
+ if (err)
+ return err;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX512BW) ||
+ !boot_cpu_has(X86_FEATURE_AVX512VL) ||
+ !boot_cpu_has(X86_FEATURE_BMI2) ||
+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL))
+ return 0;
+
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1,
+ &aes_xts_simdalg_vaes_avx10_256);
+ if (err)
+ return err;
+
+ if (x86_match_cpu(zmm_exclusion_list))
+ aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
+
+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1,
+ &aes_xts_simdalg_vaes_avx10_512);
+ if (err)
+ return err;
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
+ return 0;
+}
+
+static void unregister_xts_algs(void)
+{
+ if (aes_xts_simdalg_aesni_avx)
+ simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1,
+ &aes_xts_simdalg_aesni_avx);
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+ if (aes_xts_simdalg_vaes_avx2)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1,
+ &aes_xts_simdalg_vaes_avx2);
+ if (aes_xts_simdalg_vaes_avx10_256)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1,
+ &aes_xts_simdalg_vaes_avx10_256);
+ if (aes_xts_simdalg_vaes_avx10_512)
+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1,
+ &aes_xts_simdalg_vaes_avx10_512);
+#endif
+}
+#else /* CONFIG_X86_64 */
+static int __init register_xts_algs(void)
+{
+ return 0;
+}
+
+static void unregister_xts_algs(void)
+{
+}
+#endif /* !CONFIG_X86_64 */
#ifdef CONFIG_X86_64
static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
@@ -1146,7 +1316,8 @@ static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
- rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+ aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded,
+ ctx->hash_subkey);
}
static int generic_gcmaes_encrypt(struct aead_request *req)
@@ -1276,13 +1447,21 @@ static int __init aesni_init(void)
goto unregister_aeads;
#endif /* CONFIG_X86_64 */
+ err = register_xts_algs();
+ if (err)
+ goto unregister_xts;
+
return 0;
+unregister_xts:
+ unregister_xts_algs();
#ifdef CONFIG_X86_64
+ if (aesni_simd_xctr)
+ simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
unregister_aeads:
+#endif /* CONFIG_X86_64 */
simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
aesni_simd_aeads);
-#endif /* CONFIG_X86_64 */
unregister_skciphers:
simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
@@ -1303,6 +1482,7 @@ static void __exit aesni_exit(void)
if (boot_cpu_has(X86_FEATURE_AVX))
simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
#endif /* CONFIG_X86_64 */
+ unregister_xts_algs();
}
late_initcall(aesni_init);
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index ef73a3ab8726..791386d9a83a 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -154,5 +154,6 @@ SYM_TYPED_FUNC_START(nh_avx2)
vpaddq T1, T0, T0
vpaddq T4, T0, T0
vmovdqu T0, (HASH)
+ vzeroupper
RET
SYM_FUNC_END(nh_avx2)
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9918212faf91..0ffb072be956 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -716,6 +716,7 @@ SYM_TYPED_FUNC_START(sha256_transform_rorx)
popq %r13
popq %r12
popq %rbx
+ vzeroupper
RET
SYM_FUNC_END(sha256_transform_rorx)
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 537b6dcd7ed8..d515a55a3bc1 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -62,20 +62,41 @@
#define SHA256CONSTANTS %rax
-#define MSG %xmm0
+#define MSG %xmm0 /* sha256rnds2 implicit operand */
#define STATE0 %xmm1
#define STATE1 %xmm2
-#define MSGTMP0 %xmm3
-#define MSGTMP1 %xmm4
-#define MSGTMP2 %xmm5
-#define MSGTMP3 %xmm6
-#define MSGTMP4 %xmm7
+#define MSG0 %xmm3
+#define MSG1 %xmm4
+#define MSG2 %xmm5
+#define MSG3 %xmm6
+#define TMP %xmm7
#define SHUF_MASK %xmm8
#define ABEF_SAVE %xmm9
#define CDGH_SAVE %xmm10
+.macro do_4rounds i, m0, m1, m2, m3
+.if \i < 16
+ movdqu \i*4(DATA_PTR), \m0
+ pshufb SHUF_MASK, \m0
+.endif
+ movdqa (\i-32)*4(SHA256CONSTANTS), MSG
+ paddd \m0, MSG
+ sha256rnds2 STATE0, STATE1
+.if \i >= 12 && \i < 60
+ movdqa \m0, TMP
+ palignr $4, \m3, TMP
+ paddd TMP, \m1
+ sha256msg2 \m0, \m1
+.endif
+ punpckhqdq MSG, MSG
+ sha256rnds2 STATE1, STATE0
+.if \i >= 4 && \i < 52
+ sha256msg1 \m0, \m3
+.endif
+.endm
+
/*
* Intel SHA Extensions optimized implementation of a SHA-256 update function
*
@@ -86,9 +107,6 @@
* store partial blocks. All message padding and hash value initialization must
* be done outside the update function.
*
- * The indented lines in the loop are instructions related to rounds processing.
- * The non-indented lines are instructions related to the message schedule.
- *
* void sha256_ni_transform(uint32_t *digest, const void *data,
uint32_t numBlocks);
* digest : pointer to digest
@@ -108,202 +126,29 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
* Need to reorder these appropriately
* DCBA, HGFE -> ABEF, CDGH
*/
- movdqu 0*16(DIGEST_PTR), STATE0
- movdqu 1*16(DIGEST_PTR), STATE1
+ movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */
+ movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */
- pshufd $0xB1, STATE0, STATE0 /* CDAB */
- pshufd $0x1B, STATE1, STATE1 /* EFGH */
- movdqa STATE0, MSGTMP4
- palignr $8, STATE1, STATE0 /* ABEF */
- pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* FEBA */
+ punpckhqdq TMP, STATE1 /* DCHG */
+ pshufd $0x1B, STATE0, STATE0 /* ABEF */
+ pshufd $0xB1, STATE1, STATE1 /* CDGH */
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256(%rip), SHA256CONSTANTS
+ lea K256+32*4(%rip), SHA256CONSTANTS
.Lloop0:
/* Save hash values for addition after rounds */
movdqa STATE0, ABEF_SAVE
movdqa STATE1, CDGH_SAVE
- /* Rounds 0-3 */
- movdqu 0*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP0
- paddd 0*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 4-7 */
- movdqu 1*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP1
- paddd 1*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 8-11 */
- movdqu 2*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP2
- paddd 2*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 12-15 */
- movdqu 3*16(DATA_PTR), MSG
- pshufb SHUF_MASK, MSG
- movdqa MSG, MSGTMP3
- paddd 3*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 16-19 */
- movdqa MSGTMP0, MSG
- paddd 4*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 20-23 */
- movdqa MSGTMP1, MSG
- paddd 5*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 24-27 */
- movdqa MSGTMP2, MSG
- paddd 6*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 28-31 */
- movdqa MSGTMP3, MSG
- paddd 7*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 32-35 */
- movdqa MSGTMP0, MSG
- paddd 8*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 36-39 */
- movdqa MSGTMP1, MSG
- paddd 9*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP1, MSGTMP0
-
- /* Rounds 40-43 */
- movdqa MSGTMP2, MSG
- paddd 10*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP2, MSGTMP1
-
- /* Rounds 44-47 */
- movdqa MSGTMP3, MSG
- paddd 11*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP3, MSGTMP4
- palignr $4, MSGTMP2, MSGTMP4
- paddd MSGTMP4, MSGTMP0
- sha256msg2 MSGTMP3, MSGTMP0
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP3, MSGTMP2
-
- /* Rounds 48-51 */
- movdqa MSGTMP0, MSG
- paddd 12*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP0, MSGTMP4
- palignr $4, MSGTMP3, MSGTMP4
- paddd MSGTMP4, MSGTMP1
- sha256msg2 MSGTMP0, MSGTMP1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
- sha256msg1 MSGTMP0, MSGTMP3
-
- /* Rounds 52-55 */
- movdqa MSGTMP1, MSG
- paddd 13*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP1, MSGTMP4
- palignr $4, MSGTMP0, MSGTMP4
- paddd MSGTMP4, MSGTMP2
- sha256msg2 MSGTMP1, MSGTMP2
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 56-59 */
- movdqa MSGTMP2, MSG
- paddd 14*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- movdqa MSGTMP2, MSGTMP4
- palignr $4, MSGTMP1, MSGTMP4
- paddd MSGTMP4, MSGTMP3
- sha256msg2 MSGTMP2, MSGTMP3
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
-
- /* Rounds 60-63 */
- movdqa MSGTMP3, MSG
- paddd 15*16(SHA256CONSTANTS), MSG
- sha256rnds2 STATE0, STATE1
- pshufd $0x0E, MSG, MSG
- sha256rnds2 STATE1, STATE0
+.irp i, 0, 16, 32, 48
+ do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
+ do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
+ do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
+ do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
+.endr
/* Add current hash values with previously saved */
paddd ABEF_SAVE, STATE0
@@ -315,14 +160,14 @@ SYM_TYPED_FUNC_START(sha256_ni_transform)
jne .Lloop0
/* Write hash values back in the correct order */
- pshufd $0x1B, STATE0, STATE0 /* FEBA */
- pshufd $0xB1, STATE1, STATE1 /* DCHG */
- movdqa STATE0, MSGTMP4
- pblendw $0xF0, STATE1, STATE0 /* DCBA */
- palignr $8, MSGTMP4, STATE1 /* HGFE */
-
- movdqu STATE0, 0*16(DIGEST_PTR)
- movdqu STATE1, 1*16(DIGEST_PTR)
+ movdqa STATE0, TMP
+ punpcklqdq STATE1, STATE0 /* GHEF */
+ punpckhqdq TMP, STATE1 /* ABCD */
+ pshufd $0xB1, STATE0, STATE0 /* HGFE */
+ pshufd $0x1B, STATE1, STATE1 /* DCBA */
+
+ movdqu STATE1, 0*16(DIGEST_PTR)
+ movdqu STATE0, 1*16(DIGEST_PTR)
.Ldone_hash:
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index f08496cd6870..24973f42c43f 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -680,6 +680,7 @@ SYM_TYPED_FUNC_START(sha512_transform_rorx)
pop %r12
pop %rbx
+ vzeroupper
RET
SYM_FUNC_END(sha512_transform_rorx)
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index c93e7f5c2a06..ce1cc1622385 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -17,7 +17,7 @@ obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/
-obj-$(CONFIG_PREEMPTION) += thunk_$(BITS).o
+obj-$(CONFIG_PREEMPTION) += thunk.o
CFLAGS_entry_fred.o += -fno-stack-protector
CFLAGS_REMOVE_entry_fred.o += -pg $(CC_FLAGS_FTRACE)
obj-$(CONFIG_X86_FRED) += entry_64_fred.o entry_fred.o
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6356060caaf3..51cc9c7cb9bd 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -49,7 +49,7 @@ static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
if (likely(unr < NR_syscalls)) {
unr = array_index_nospec(unr, NR_syscalls);
- regs->ax = sys_call_table[unr](regs);
+ regs->ax = x64_sys_call(regs, unr);
return true;
}
return false;
@@ -66,7 +66,7 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
xnr = array_index_nospec(xnr, X32_NR_syscalls);
- regs->ax = x32_sys_call_table[xnr](regs);
+ regs->ax = x32_sys_call(regs, xnr);
return true;
}
return false;
@@ -162,7 +162,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
if (likely(unr < IA32_NR_syscalls)) {
unr = array_index_nospec(unr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call_table[unr](regs);
+ regs->ax = ia32_sys_call(regs, unr);
} else if (nr != -1) {
regs->ax = __ia32_sys_ni_syscall(regs);
}
@@ -189,7 +189,7 @@ static __always_inline bool int80_is_external(void)
}
/**
- * int80_emulation - 32-bit legacy syscall entry
+ * do_int80_emulation - 32-bit legacy syscall C entry from asm
*
* This entry point can be used by 32-bit and 64-bit programs to perform
* 32-bit system calls. Instances of INT $0x80 can be found inline in
@@ -207,7 +207,7 @@ static __always_inline bool int80_is_external(void)
* eax: system call number
* ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
*/
-DEFINE_IDTENTRY_RAW(int80_emulation)
+__visible noinstr void do_int80_emulation(struct pt_regs *regs)
{
int nr;
@@ -255,6 +255,71 @@ DEFINE_IDTENTRY_RAW(int80_emulation)
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * A FRED-specific INT80 handler is warranted for the follwing reasons:
+ *
+ * 1) As INT instructions and hardware interrupts are separate event
+ * types, FRED does not preclude the use of vector 0x80 for external
+ * interrupts. As a result, the FRED setup code does not reserve
+ * vector 0x80 and calling int80_is_external() is not merely
+ * suboptimal but actively incorrect: it could cause a system call
+ * to be incorrectly ignored.
+ *
+ * 2) It is called only for handling vector 0x80 of event type
+ * EVENT_TYPE_SWINT and will never be called to handle any external
+ * interrupt (event type EVENT_TYPE_EXTINT).
+ *
+ * 3) FRED has separate entry flows depending on if the event came from
+ * user space or kernel space, and because the kernel does not use
+ * INT insns, the FRED kernel entry handler fred_entry_from_kernel()
+ * falls through to fred_bad_type() if the event type is
+ * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
+ * an INT insn, it can only be from a user level.
+ *
+ * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will
+ * likely take a different approach if it is ever needed: it
+ * probably belongs in either fred_intx()/ fred_other() or
+ * asm_fred_entrypoint_user(), depending on if this ought to be done
+ * for all entries from userspace or only system
+ * calls.
+ *
+ * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
+ */
+DEFINE_FREDENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /*
+ * FRED pushed 0 into regs::orig_ax and regs::ax contains the
+ * syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#endif
#else /* CONFIG_IA32_EMULATION */
/* Handles int $0x80 on a 32bit kernel */
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 8af2a26b24f6..1b5be07f8669 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -116,6 +116,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
call do_syscall_64 /* returns with IRQs disabled */
@@ -1491,3 +1492,63 @@ SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
call make_task_dead
SYM_CODE_END(rewind_stack_and_make_dead)
.popsection
+
+/*
+ * This sequence executes branches in order to remove user branch information
+ * from the branch history tracker in the Branch Predictor, therefore removing
+ * user influence on subsequent BTB lookups.
+ *
+ * It should be used on parts prior to Alder Lake. Newer parts should use the
+ * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being
+ * virtualized on newer hardware the VMM should protect against BHI attacks by
+ * setting BHI_DIS_S for the guests.
+ *
+ * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging
+ * and not clearing the branch history. The call tree looks like:
+ *
+ * call 1
+ * call 2
+ * call 2
+ * call 2
+ * call 2
+ * call 2
+ * ret
+ * ret
+ * ret
+ * ret
+ * ret
+ * ret
+ *
+ * This means that the stack is non-constant and ORC can't unwind it with %rsp
+ * alone. Therefore we unconditionally set up the frame pointer, which allows
+ * ORC to unwind properly.
+ *
+ * The alignment is for performance and not for safety, and may be safely
+ * refactored in the future if needed.
+ */
+SYM_FUNC_START(clear_bhb_loop)
+ push %rbp
+ mov %rsp, %rbp
+ movl $5, %ecx
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call 1f
+ jmp 5f
+ .align 64, 0xcc
+ ANNOTATE_INTRA_FUNCTION_CALL
+1: call 2f
+ RET
+ .align 64, 0xcc
+2: movl $5, %eax
+3: jmp 4f
+ nop
+4: sub $1, %eax
+ jnz 3b
+ sub $1, %ecx
+ jnz 1b
+ RET
+5: lfence
+ pop %rbp
+ RET
+SYM_FUNC_END(clear_bhb_loop)
+EXPORT_SYMBOL_GPL(clear_bhb_loop)
+STACK_FRAME_NON_STANDARD(clear_bhb_loop)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index eabf48c4d4b4..11c9b8efdc4c 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -7,7 +7,6 @@
#include <asm/asm-offsets.h>
#include <asm/current.h>
#include <asm/errno.h>
-#include <asm/ia32_unistd.h>
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/irqflags.h>
@@ -92,6 +91,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
IBRS_ENTER
UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
/*
* SYSENTER doesn't filter flags, so we need to clear NT and AC
@@ -206,6 +206,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
IBRS_ENTER
UNTRAIN_RET
+ CLEAR_BRANCH_HISTORY
movq %rsp, %rdi
call do_fast_syscall_32
@@ -276,3 +277,17 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_compat)
+
+/*
+ * int 0x80 is used by 32 bit mode as a system call entry. Normally idt entries
+ * point to C routines, however since this is a system call interface the branch
+ * history needs to be scrubbed to protect against BHI attacks, and that
+ * scrubbing needs to take place in assembly code prior to entering any C
+ * routines.
+ */
+SYM_CODE_START(int80_emulation)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ CLEAR_BRANCH_HISTORY
+ jmp do_int80_emulation
+SYM_CODE_END(int80_emulation)
diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c
index ac120cbdaaf2..f004a4dc74c2 100644
--- a/arch/x86/entry/entry_fred.c
+++ b/arch/x86/entry/entry_fred.c
@@ -28,9 +28,9 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code
if (regs->fred_cs.sl > 0) {
pr_emerg("PANIC: invalid or fatal FRED event; event type %u "
"vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
- regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax,
+ regs->fred_ss.type, regs->fred_ss.vector, error_code,
fred_event_data(regs), regs->cs, regs->ip);
- die("invalid or fatal FRED event", regs, regs->orig_ax);
+ die("invalid or fatal FRED event", regs, error_code);
panic("invalid or fatal FRED event");
} else {
unsigned long flags = oops_begin();
@@ -38,10 +38,10 @@ static noinstr void fred_bad_type(struct pt_regs *regs, unsigned long error_code
pr_alert("BUG: invalid or fatal FRED event; event type %u "
"vector %u error 0x%lx aux 0x%lx at %04x:%016lx\n",
- regs->fred_ss.type, regs->fred_ss.vector, regs->orig_ax,
+ regs->fred_ss.type, regs->fred_ss.vector, error_code,
fred_event_data(regs), regs->cs, regs->ip);
- if (__die("Invalid or fatal FRED event", regs, regs->orig_ax))
+ if (__die("Invalid or fatal FRED event", regs, error_code))
sig = 0;
oops_end(flags, regs, sig);
@@ -66,7 +66,7 @@ static noinstr void fred_intx(struct pt_regs *regs)
/* INT80 */
case IA32_SYSCALL_VECTOR:
if (ia32_enabled())
- return int80_emulation(regs);
+ return fred_int80_emulation(regs);
fallthrough;
#endif
@@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+
+ SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification),
};
static bool fred_setup_done __initdata;
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 8cfc9bc73e7f..c2235bae17ef 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -18,8 +18,25 @@
#include <asm/syscalls_32.h>
#undef __SYSCALL
+/*
+ * The sys_call_table[] is no longer used for system calls, but
+ * kernel/trace/trace_syscalls.c still wants to know the system
+ * call address.
+ */
+#ifdef CONFIG_X86_32
#define __SYSCALL(nr, sym) __ia32_##sym,
-
-__visible const sys_call_ptr_t ia32_sys_call_table[] = {
+const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_32.h>
};
+#undef __SYSCALL
+#endif
+
+#define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
+
+long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_32.h>
+ default: return __ia32_sys_ni_syscall(regs);
+ }
+};
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index be120eec1fc9..33b3f09e6f15 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -11,8 +11,23 @@
#include <asm/syscalls_64.h>
#undef __SYSCALL
+/*
+ * The sys_call_table[] is no longer used for system calls, but
+ * kernel/trace/trace_syscalls.c still wants to know the system
+ * call address.
+ */
#define __SYSCALL(nr, sym) __x64_##sym,
-
-asmlinkage const sys_call_ptr_t sys_call_table[] = {
+const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
+#undef __SYSCALL
+
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
+
+long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_64.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
+};
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index bdd0e03a1265..03de4a932131 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -11,8 +11,12 @@
#include <asm/syscalls_x32.h>
#undef __SYSCALL
-#define __SYSCALL(nr, sym) __x64_##sym,
+#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
-asmlinkage const sys_call_ptr_t x32_sys_call_table[] = {
-#include <asm/syscalls_x32.h>
+long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
+{
+ switch (nr) {
+ #include <asm/syscalls_x32.h>
+ default: return __x64_sys_ni_syscall(regs);
+ }
};
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7e8d46f4147f..cc78226ffc35 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -374,7 +374,7 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
-453 64 map_shadow_stack sys_map_shadow_stack
+453 common map_shadow_stack sys_map_shadow_stack
454 common futex_wake sys_futex_wake
455 common futex_wait sys_futex_wait
456 common futex_requeue sys_futex_requeue
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk.S
index 119ebdc3d362..119ebdc3d362 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk.S
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
deleted file mode 100644
index da37f42f4549..000000000000
--- a/arch/x86/entry/thunk_32.S
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
- * Copyright 2008 by Steven Rostedt, Red Hat, Inc
- * (inspired by Andi Kleen's thunk_64.S)
- */
-
-#include <linux/export.h>
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-#include "calling.h"
-
-THUNK preempt_schedule_thunk, preempt_schedule
-THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
-EXPORT_SYMBOL(preempt_schedule_thunk)
-EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
-
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 620f6257bbe9..3d64bcc403cf 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -9,7 +9,9 @@ include $(srctree)/lib/vdso/Makefile
# Sanitizer runtimes are unavailable and cannot be linked here.
KASAN_SANITIZE := n
KMSAN_SANITIZE_vclock_gettime.o := n
+KMSAN_SANITIZE_vdso32/vclock_gettime.o := n
KMSAN_SANITIZE_vgetcpu.o := n
+KMSAN_SANITIZE_vdso32/vgetcpu.o := n
UBSAN_SANITIZE := n
KCSAN_SANITIZE := n
@@ -39,6 +41,7 @@ obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o
obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o
OBJECT_FILES_NON_STANDARD_vdso-image-32.o := n
+OBJECT_FILES_NON_STANDARD_vdso-image-x32.o := n
OBJECT_FILES_NON_STANDARD_vdso-image-64.o := n
OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index a3c0df11d0e6..2fb7d53cf333 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -98,11 +98,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
- /*
- * XXX: if access_ok, get_user, and put_user handled
- * sig_on_uaccess_err, this could go away.
- */
-
if (!access_ok((void __user *)ptr, size)) {
struct thread_struct *thread = &current->thread;
@@ -120,10 +115,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
bool emulate_vsyscall(unsigned long error_code,
struct pt_regs *regs, unsigned long address)
{
- struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr, syscall_nr, tmp;
- int prev_sig_on_uaccess_err;
long ret;
unsigned long orig_dx;
@@ -172,8 +165,6 @@ bool emulate_vsyscall(unsigned long error_code,
goto sigsegv;
}
- tsk = current;
-
/*
* Check for access_ok violations and find the syscall nr.
*
@@ -234,12 +225,8 @@ bool emulate_vsyscall(unsigned long error_code,
goto do_ret; /* skip requested */
/*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
+ * With a real vsyscall, page faults cause SIGSEGV.
*/
- prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err;
- current->thread.sig_on_uaccess_err = 1;
-
ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
@@ -262,23 +249,12 @@ bool emulate_vsyscall(unsigned long error_code,
break;
}
- current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err;
-
check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
-
- /*
- * If we failed to generate a signal for any reason,
- * generate one here. (This should be impossible.)
- */
- if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
- !sigismember(&tsk->pending.signal, SIGSEGV)))
- goto sigsegv;
-
- return true; /* Don't emulate the ret. */
+ goto sigsegv;
}
regs->ax = ret;
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 69a3b02e50bb..1fc4ce44e743 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -250,7 +250,7 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] =
/*
* AMD Performance Monitor Family 17h and later:
*/
-static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
+static const u64 amd_zen1_perfmon_event_map[PERF_COUNT_HW_MAX] =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -262,10 +262,39 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187,
};
+static const u64 amd_zen2_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9,
+};
+
+static const u64 amd_zen4_perfmon_event_map[PERF_COUNT_HW_MAX] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
+ [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9,
+ [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x100000120,
+};
+
static u64 amd_pmu_event_map(int hw_event)
{
- if (boot_cpu_data.x86 >= 0x17)
- return amd_f17h_perfmon_event_map[hw_event];
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4) || boot_cpu_data.x86 >= 0x1a)
+ return amd_zen4_perfmon_event_map[hw_event];
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2) || boot_cpu_data.x86 >= 0x19)
+ return amd_zen2_perfmon_event_map[hw_event];
+
+ if (cpu_feature_enabled(X86_FEATURE_ZEN1))
+ return amd_zen1_perfmon_event_map[hw_event];
return amd_perfmon_event_map[hw_event];
}
@@ -604,7 +633,6 @@ static void amd_pmu_cpu_dead(int cpu)
kfree(cpuhw->lbr_sel);
cpuhw->lbr_sel = NULL;
- amd_pmu_cpu_reset(cpu);
if (!x86_pmu.amd_nb_constraints)
return;
@@ -619,7 +647,7 @@ static void amd_pmu_cpu_dead(int cpu)
}
}
-static inline void amd_pmu_set_global_ctl(u64 ctl)
+static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
{
wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
}
@@ -879,6 +907,37 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
return amd_pmu_adjust_nmi_window(handled);
}
+/*
+ * AMD-specific callback invoked through perf_snapshot_branch_stack static
+ * call, defined in include/linux/perf_event.h. See its definition for API
+ * details. It's up to caller to provide enough space in *entries* to fit all
+ * LBR records, otherwise returned result will be truncated to *cnt* entries.
+ */
+static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
+{
+ struct cpu_hw_events *cpuc;
+ unsigned long flags;
+
+ /*
+ * The sequence of steps to freeze LBR should be completely inlined
+ * and contain no branches to minimize contamination of LBR snapshot
+ */
+ local_irq_save(flags);
+ amd_pmu_core_disable_all();
+ __amd_pmu_lbr_disable();
+
+ cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ amd_pmu_lbr_read();
+ cnt = min(cnt, x86_pmu.lbr_nr);
+ memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
+
+ amd_pmu_v2_enable_all(0);
+ local_irq_restore(flags);
+
+ return cnt;
+}
+
static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -905,8 +964,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (!status)
goto done;
- /* Read branch records before unfreezing */
- if (status & GLOBAL_STATUS_LBRS_FROZEN) {
+ /* Read branch records */
+ if (x86_pmu.lbr_nr) {
amd_pmu_lbr_read();
status &= ~GLOBAL_STATUS_LBRS_FROZEN;
}
@@ -1415,6 +1474,10 @@ static int __init amd_core_pmu_init(void)
static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
+
+ /* Only support branch_stack snapshot on perfmon v2 */
+ if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
+ static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
} else if (!amd_brs_init()) {
/*
* BRS requires special event constraints and flushing on ctxsw.
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index eb31f850841a..19c7b76e21bc 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -173,9 +173,11 @@ void amd_pmu_lbr_read(void)
/*
* Check if a branch has been logged; if valid = 0, spec = 0
- * then no branch was recorded
+ * then no branch was recorded; if reserved = 1 then an
+ * erroneous branch was recorded (see Erratum 1452)
*/
- if (!entry.to.split.valid && !entry.to.split.spec)
+ if ((!entry.to.split.valid && !entry.to.split.spec) ||
+ entry.to.split.reserved)
continue;
perf_clear_branch_entry_bitfields(br + out);
@@ -308,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event)
{
int ret = 0;
- /* LBR is not recommended in counting mode */
- if (!is_sampling_event(event))
- return -EINVAL;
-
ret = amd_pmu_lbr_setup_filter(event);
if (!ret)
event->attach_state |= PERF_ATTACH_SCHED_CB;
@@ -400,26 +398,23 @@ void amd_pmu_lbr_enable_all(void)
wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select);
}
- rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
- rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ }
- wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN);
}
void amd_pmu_lbr_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- u64 dbg_ctl, dbg_extn_cfg;
if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
return;
- rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
- rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
-
- wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ __amd_pmu_lbr_disable();
}
__init int amd_pmu_lbr_init(void)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 09050641ce5d..5b0dd07b1ef1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1644,6 +1644,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
while (++i < cpuc->n_events) {
cpuc->event_list[i-1] = cpuc->event_list[i];
cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+ cpuc->assign[i-1] = cpuc->assign[i];
}
cpuc->event_constraint[i-1] = NULL;
--cpuc->n_events;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 326c8cd5aa2d..54eb142810fb 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -696,78 +696,78 @@ static const struct cstate_model srf_cstates __initconst = {
static const struct x86_cpu_id intel_cstates_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hswult_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &slm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_D, &slm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &slm_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &snb_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &snb_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &hswult_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &hswult_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &cnl_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &srf_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &grr_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &icx_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &icx_cstates),
-
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &adl_cstates),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhm_cstates),
+
+ X86_MATCH_VFM(INTEL_WESTMERE, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_cstates),
+ X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhm_cstates),
+
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_HASWELL, &snb_cstates),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &snb_cstates),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_HASWELL_L, &hswult_cstates),
+
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates),
+
+ X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &snb_cstates),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &snb_cstates),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &snb_cstates),
+
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &hswult_cstates),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &hswult_cstates),
+
+ X86_MATCH_VFM(INTEL_CANNONLAKE_L, &cnl_cstates),
+
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_cstates),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_cstates),
+
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_D, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GOLDMONT_PLUS, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_L, &glm_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &srf_cstates),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &grr_cstates),
+
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_cstates),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates),
+
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &icl_cstates),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_cstates),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_cstates),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &adl_cstates),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &adl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 2641ba620f12..e010bfed8417 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1237,11 +1237,11 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct pmu *pmu = event->pmu;
/*
- * Make sure we get updated with the first PEBS
- * event. It will trigger also during removal, but
- * that does not hurt:
+ * Make sure we get updated with the first PEBS event.
+ * During removal, ->pebs_data_cfg is still valid for
+ * the last PEBS event. Don't clear it.
*/
- if (cpuc->n_pebs == 1)
+ if ((cpuc->n_pebs == 1) && add)
cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW;
if (needed_cb != pebs_needs_sched_cb(cpuc)) {
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 78cd5084104e..dc641b50814e 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -2,6 +2,7 @@
#include <linux/perf_event.h>
#include <linux/types.h>
+#include <asm/cpu_device_id.h>
#include <asm/perf_event.h>
#include <asm/msr.h>
@@ -1457,7 +1458,7 @@ void __init intel_pmu_lbr_init_atom(void)
* to have an operational LBR which can freeze
* on PMU interrupt
*/
- if (boot_cpu_data.x86_model == 28
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_BONNELL
&& boot_cpu_data.x86_stepping < 10) {
pr_cont("LBR disabled due to erratum");
return;
@@ -1693,6 +1694,7 @@ void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
lbr->from = x86_pmu.lbr_from;
lbr->to = x86_pmu.lbr_to;
lbr->info = x86_pmu.lbr_info;
+ lbr->has_callstack = x86_pmu_has_lbr_callstack();
}
EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 8e2a12235e62..14db6d9d318b 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -22,7 +22,7 @@
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include "../perf_event.h"
#include "pt.h"
@@ -211,11 +211,11 @@ static int __init pt_pmu_hw_init(void)
}
/* model-specific quirks */
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_D:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
/* not setting BRANCH_EN will #GP, erratum BDM106 */
pt_pmu.branch_en_always_on = true;
break;
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 258e2cdf28fa..419c517b8594 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1829,56 +1829,56 @@ static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
};
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &ivb_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &hsw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &bdw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &bdw_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snbep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhmex_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhmex_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ivbep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &hswep_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &bdx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &bdx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &mtl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, &adl_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT_X, &gnr_uncore_init),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_CRESTMONT, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM_EP, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE_EP, &nhm_uncore_init),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE, &snb_uncore_init),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE, &ivb_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_L, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_G, &hsw_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL, &bdw_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, &bdw_uncore_init),
+ X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, &snbep_uncore_init),
+ X86_MATCH_VFM(INTEL_NEHALEM_EX, &nhmex_uncore_init),
+ X86_MATCH_VFM(INTEL_WESTMERE_EX, &nhmex_uncore_init),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ivbep_uncore_init),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &hswep_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &bdx_uncore_init),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &bdx_uncore_init),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &knl_uncore_init),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &knl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_uncore_init),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_KABYLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_COMETLAKE_L, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_COMETLAKE, &skl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE, &icl_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &icx_uncore_init),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &icx_uncore_init),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, &tgl_l_uncore_init),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, &tgl_uncore_init),
+ X86_MATCH_VFM(INTEL_ROCKETLAKE, &rkl_uncore_init),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_METEORLAKE, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_uncore_init),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_TREMONT_D, &snr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, &adl_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_uncore_init),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_uncore_init),
{},
};
MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match);
diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c
index 92da8aaa5966..466833478e81 100644
--- a/arch/x86/events/intel/uncore_nhmex.c
+++ b/arch/x86/events/intel/uncore_nhmex.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Nehalem-EX/Westmere-EX uncore support */
+#include <asm/cpu_device_id.h>
#include "uncore.h"
/* NHM-EX event control */
@@ -1217,7 +1218,7 @@ static struct intel_uncore_type *nhmex_msr_uncores[] = {
void nhmex_uncore_cpu_init(void)
{
- if (boot_cpu_data.x86_model == 46)
+ if (boot_cpu_data.x86_vfm == INTEL_NEHALEM_EX)
uncore_nhmex = true;
else
nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 2eaf0f339849..74b8b21e8990 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* SandyBridge-EP/IvyTown uncore support */
+#include <asm/cpu_device_id.h>
#include "uncore.h"
#include "uncore_discovery.h"
@@ -3285,7 +3286,7 @@ void bdx_uncore_cpu_init(void)
uncore_msr_uncores = bdx_msr_uncores;
/* Detect systems with no SBOXes */
- if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID))
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_D || hswep_has_limit_sbox(BDX_PCU_DID))
uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
@@ -5394,7 +5395,7 @@ static int icx_iio_get_topology(struct intel_uncore_type *type)
static void icx_iio_set_mapping(struct intel_uncore_type *type)
{
/* Detect ICX-D system. This case is not supported */
- if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) {
+ if (boot_cpu_data.x86_vfm == INTEL_ICELAKE_D) {
pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group);
return;
}
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 9e237b30f017..45b1866ff051 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -2,7 +2,7 @@
#include <linux/perf_event.h>
#include <linux/sysfs.h>
#include <linux/nospec.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include "probe.h"
enum perf_msr_id {
@@ -43,75 +43,75 @@ static bool test_intel(int idx, void *data)
boot_cpu_data.x86 != 6)
return false;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_NEHALEM_G:
- case INTEL_FAM6_NEHALEM_EP:
- case INTEL_FAM6_NEHALEM_EX:
-
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_WESTMERE_EP:
- case INTEL_FAM6_WESTMERE_EX:
-
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_SANDYBRIDGE_X:
-
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE_X:
-
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_X:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
-
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_D:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_BROADWELL_X:
- case INTEL_FAM6_SAPPHIRERAPIDS_X:
- case INTEL_FAM6_EMERALDRAPIDS_X:
- case INTEL_FAM6_GRANITERAPIDS_X:
- case INTEL_FAM6_GRANITERAPIDS_D:
-
- case INTEL_FAM6_ATOM_SILVERMONT:
- case INTEL_FAM6_ATOM_SILVERMONT_D:
- case INTEL_FAM6_ATOM_AIRMONT:
-
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_D:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
- case INTEL_FAM6_ATOM_TREMONT_D:
- case INTEL_FAM6_ATOM_TREMONT:
- case INTEL_FAM6_ATOM_TREMONT_L:
-
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_WESTMERE_EX:
+
+ case INTEL_SANDYBRIDGE:
+ case INTEL_SANDYBRIDGE_X:
+
+ case INTEL_IVYBRIDGE:
+ case INTEL_IVYBRIDGE_X:
+
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_X:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_D:
+ case INTEL_BROADWELL_G:
+ case INTEL_BROADWELL_X:
+ case INTEL_SAPPHIRERAPIDS_X:
+ case INTEL_EMERALDRAPIDS_X:
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_GRANITERAPIDS_D:
+
+ case INTEL_ATOM_SILVERMONT:
+ case INTEL_ATOM_SILVERMONT_D:
+ case INTEL_ATOM_AIRMONT:
+
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_D:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_TREMONT_D:
+ case INTEL_ATOM_TREMONT:
+ case INTEL_ATOM_TREMONT_L:
+
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
if (idx == PERF_MSR_SMI)
return true;
break;
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_SKYLAKE_X:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
- case INTEL_FAM6_COMETLAKE_L:
- case INTEL_FAM6_COMETLAKE:
- case INTEL_FAM6_ICELAKE_L:
- case INTEL_FAM6_ICELAKE:
- case INTEL_FAM6_ICELAKE_X:
- case INTEL_FAM6_ICELAKE_D:
- case INTEL_FAM6_TIGERLAKE_L:
- case INTEL_FAM6_TIGERLAKE:
- case INTEL_FAM6_ROCKETLAKE:
- case INTEL_FAM6_ALDERLAKE:
- case INTEL_FAM6_ALDERLAKE_L:
- case INTEL_FAM6_ATOM_GRACEMONT:
- case INTEL_FAM6_RAPTORLAKE:
- case INTEL_FAM6_RAPTORLAKE_P:
- case INTEL_FAM6_RAPTORLAKE_S:
- case INTEL_FAM6_METEORLAKE:
- case INTEL_FAM6_METEORLAKE_L:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_SKYLAKE_X:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ case INTEL_COMETLAKE_L:
+ case INTEL_COMETLAKE:
+ case INTEL_ICELAKE_L:
+ case INTEL_ICELAKE:
+ case INTEL_ICELAKE_X:
+ case INTEL_ICELAKE_D:
+ case INTEL_TIGERLAKE_L:
+ case INTEL_TIGERLAKE:
+ case INTEL_ROCKETLAKE:
+ case INTEL_ALDERLAKE:
+ case INTEL_ALDERLAKE_L:
+ case INTEL_ATOM_GRACEMONT:
+ case INTEL_RAPTORLAKE:
+ case INTEL_RAPTORLAKE_P:
+ case INTEL_RAPTORLAKE_S:
+ case INTEL_METEORLAKE:
+ case INTEL_METEORLAKE_L:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index fb56518356ec..72b022a1e16c 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1329,6 +1329,19 @@ void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
+static __always_inline void __amd_pmu_lbr_disable(void)
+{
+ u64 dbg_ctl, dbg_extn_cfg;
+
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ }
+}
+
#ifdef CONFIG_PERF_EVENTS_AMD_BRS
#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index fb2b1961e5a3..ca5f687fa420 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -675,10 +675,8 @@ static const struct attribute_group *rapl_attr_update[] = {
static int __init init_rapl_pmus(void)
{
int maxdie = topology_max_packages() * topology_max_dies_per_package();
- size_t size;
- size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
- rapl_pmus = kzalloc(size, GFP_KERNEL);
+ rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, maxdie), GFP_KERNEL);
if (!rapl_pmus)
return -ENOMEM;
@@ -808,6 +806,9 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(LUNARLAKE_M, &model_skl),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 5fc45543e955..0569f579338b 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -105,7 +105,7 @@ static bool cpu_is_self(int cpu)
* IPI implementation on Hyper-V.
*/
static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
- bool exclude_self)
+ bool exclude_self)
{
struct hv_send_ipi_ex *ipi_arg;
unsigned long flags;
@@ -132,8 +132,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) {
ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
- nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask,
- exclude_self ? cpu_is_self : NULL);
+ nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask,
+ exclude_self ? cpu_is_self : NULL);
/*
* 'nr_bank <= 0' means some CPUs in cpumask can't be
@@ -147,7 +147,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
}
status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
- ipi_arg, NULL);
+ ipi_arg, NULL);
ipi_mask_ex_done:
local_irq_restore(flags);
@@ -155,7 +155,7 @@ ipi_mask_ex_done:
}
static bool __send_ipi_mask(const struct cpumask *mask, int vector,
- bool exclude_self)
+ bool exclude_self)
{
int cur_cpu, vcpu, this_cpu = smp_processor_id();
struct hv_send_ipi ipi_arg;
@@ -181,7 +181,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
return false;
}
- if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+ if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR)
return false;
/*
@@ -218,7 +218,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
}
status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
- ipi_arg.cpu_mask);
+ ipi_arg.cpu_mask);
return hv_result_success(status);
do_ex_hypercall:
@@ -241,7 +241,7 @@ static bool __send_ipi_one(int cpu, int vector)
return false;
}
- if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
+ if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR)
return false;
if (vp >= 64)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 8f3a4d16bb79..17a71e92a343 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -667,14 +667,14 @@ void hyperv_cleanup(void)
hv_hypercall_pg = NULL;
/* Reset the hypercall page */
- hypercall_msr.as_uint64 = hv_get_register(HV_X64_MSR_HYPERCALL);
+ hypercall_msr.as_uint64 = hv_get_msr(HV_X64_MSR_HYPERCALL);
hypercall_msr.enable = 0;
- hv_set_register(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ hv_set_msr(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
/* Reset the TSC page */
- tsc_msr.as_uint64 = hv_get_register(HV_X64_MSR_REFERENCE_TSC);
+ tsc_msr.as_uint64 = hv_get_msr(HV_X64_MSR_REFERENCE_TSC);
tsc_msr.enable = 0;
- hv_set_register(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+ hv_set_msr(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
}
void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
index 68a0843d4750..3fa1f2ee7b0d 100644
--- a/arch/x86/hyperv/hv_proc.c
+++ b/arch/x86/hyperv/hv_proc.c
@@ -3,7 +3,6 @@
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/clockchips.h>
-#include <linux/acpi.h>
#include <linux/hyperv.h>
#include <linux/slab.h>
#include <linux/cpuhotplug.h>
@@ -116,12 +115,11 @@ free_buf:
int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
{
- struct hv_add_logical_processor_in *input;
- struct hv_add_logical_processor_out *output;
+ struct hv_input_add_logical_processor *input;
+ struct hv_output_add_logical_processor *output;
u64 status;
unsigned long flags;
int ret = HV_STATUS_SUCCESS;
- int pxm = node_to_pxm(node);
/*
* When adding a logical processor, the hypervisor may return
@@ -137,11 +135,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
input->lp_index = lp_index;
input->apic_id = apic_id;
- input->flags = 0;
- input->proximity_domain_info.domain_id = pxm;
- input->proximity_domain_info.flags.reserved = 0;
- input->proximity_domain_info.flags.proximity_info_valid = 1;
- input->proximity_domain_info.flags.proximity_preferred = 1;
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
input, output);
local_irq_restore(flags);
@@ -166,7 +160,6 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
u64 status;
unsigned long irq_flags;
int ret = HV_STATUS_SUCCESS;
- int pxm = node_to_pxm(node);
/* Root VPs don't seem to need pages deposited */
if (partition_id != hv_current_partition_id) {
@@ -185,14 +178,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
input->vp_index = vp_index;
input->flags = flags;
input->subnode_type = HvSubnodeAny;
- if (node != NUMA_NO_NODE) {
- input->proximity_domain_info.domain_id = pxm;
- input->proximity_domain_info.flags.reserved = 0;
- input->proximity_domain_info.flags.proximity_info_valid = 1;
- input->proximity_domain_info.flags.proximity_preferred = 1;
- } else {
- input->proximity_domain_info.as_uint64 = 0;
- }
+ input->proximity_domain_info = hv_numa_node_to_pxm_info(node);
status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
local_irq_restore(irq_flags);
diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index 737d6f7a6155..151e851bef09 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -16,7 +16,7 @@
#include <asm/paravirt.h>
#include <asm/apic.h>
-static bool __initdata hv_pvspin = true;
+static bool hv_pvspin __initdata = true;
static void hv_qlock_kick(int cpu)
{
@@ -64,6 +64,7 @@ __visible bool hv_vcpu_is_preempted(int vcpu)
{
return false;
}
+
PV_CALLEE_SAVE_REGS_THUNK(hv_vcpu_is_preempted);
void __init hv_init_spinlocks(void)
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index edd2f35b2a5e..04775346369c 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -12,6 +12,7 @@
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/realmode.h>
+#include <../kernel/smpboot.h>
extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;
@@ -33,7 +34,6 @@ void __init hv_vtl_init_platform(void)
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_mptable = x86_init_noop;
x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
- x86_init.mpparse.parse_smp_cfg = x86_init_noop;
x86_platform.get_wallclock = get_rtc_noop;
x86_platform.set_wallclock = set_rtc_noop;
@@ -65,7 +65,7 @@ static void hv_vtl_ap_entry(void)
((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
}
-static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
+static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored)
{
u64 status;
int ret = 0;
@@ -79,7 +79,9 @@ static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
struct ldttss_desc *ldt;
struct desc_struct *gdt;
- u64 rsp = current->thread.sp;
+ struct task_struct *idle = idle_thread_get(cpu);
+ u64 rsp = (unsigned long)idle->thread.sp;
+
u64 rip = (u64)&hv_vtl_ap_entry;
native_store_gdt(&gdt_ptr);
@@ -206,7 +208,15 @@ static int hv_vtl_apicid_to_vp_id(u32 apic_id)
static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
{
- int vp_id;
+ int vp_id, cpu;
+
+ /* Find the logical CPU for the APIC ID */
+ for_each_present_cpu(cpu) {
+ if (arch_match_cpu_phys_id(cpu, apicid))
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
vp_id = hv_vtl_apicid_to_vp_id(apicid);
@@ -220,7 +230,7 @@ static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
return -EINVAL;
}
- return hv_vtl_bringup_vcpu(vp_id, start_eip);
+ return hv_vtl_bringup_vcpu(vp_id, cpu, start_eip);
}
int __init hv_vtl_early_init(void)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index f896eed4516c..5af926c050f0 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -56,6 +56,8 @@ static inline void disable_acpi(void)
extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
+extern int acpi_blacklisted(void);
+
static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
static inline void acpi_disable_pci(void)
{
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index fcd20c6dc7f9..ba99ef75f56c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -117,7 +117,7 @@ extern void callthunks_patch_builtin_calls(void);
extern void callthunks_patch_module_calls(struct callthunk_sites *sites,
struct module *mod);
extern void *callthunks_translate_call_dest(void *dest);
-extern int x86_call_depth_emit_accounting(u8 **pprog, void *func);
+extern int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip);
#else
static __always_inline void callthunks_patch_builtin_calls(void) {}
static __always_inline void
@@ -128,7 +128,7 @@ static __always_inline void *callthunks_translate_call_dest(void *dest)
return dest;
}
static __always_inline int x86_call_depth_emit_accounting(u8 **pprog,
- void *func)
+ void *func, void *ip)
{
return 0;
}
@@ -286,20 +286,6 @@ static inline int alternatives_text_reserved(void *start, void *end)
asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: : "i" (0), ## input)
-/*
- * This is similar to alternative_input. But it has two features and
- * respective instructions.
- *
- * If CPU has feature2, newinstr2 is used.
- * Otherwise, if CPU has feature1, newinstr1 is used.
- * Otherwise, oldinstr is used.
- */
-#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \
- ft_flags2, input...) \
- asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \
- newinstr2, ft_flags2) \
- : : "i" (0), ## input)
-
/* Like alternative_input, but with a single output argument */
#define alternative_io(oldinstr, newinstr, ft_flags, output, input...) \
asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \
@@ -307,7 +293,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
/* Like alternative_io, but for replacing a direct call with another one. */
#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \
- asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", ft_flags) \
+ asm_inline volatile (ALTERNATIVE("call %c[old]", "call %c[new]", ft_flags) \
: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
/*
@@ -316,12 +302,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, if CPU has feature1, function1 is used.
* Otherwise, old function is used.
*/
-#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
- output, input...) \
- asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", ft_flags1,\
- "call %P[new2]", ft_flags2) \
- : output, ASM_CALL_CONSTRAINT \
- : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
+#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
+ output, input...) \
+ asm_inline volatile (ALTERNATIVE_2("call %c[old]", "call %c[new1]", ft_flags1, \
+ "call %c[new2]", ft_flags2) \
+ : output, ASM_CALL_CONSTRAINT \
+ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
[new2] "i" (newfunc2), ## input)
/*
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 94ce0f7c9d3a..9327eb00e96d 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -13,6 +13,8 @@
#include <asm/mpspec.h>
#include <asm/msr.h>
#include <asm/hardirq.h>
+#include <asm/io.h>
+#include <asm/posted_intr.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
@@ -91,14 +93,14 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
static inline u32 native_apic_mem_read(u32 reg)
{
- return *((volatile u32 *)(APIC_BASE + reg));
+ return readl((void __iomem *)(APIC_BASE + reg));
}
static inline void native_apic_mem_eoi(void)
@@ -499,6 +501,11 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector)
return !!(irr & (1U << (vector % 32)));
}
+static inline bool is_vector_pending(unsigned int vector)
+{
+ return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector);
+}
+
/*
* Warm reset vector position:
*/
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 076bf8dee702..25466c4d2134 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -14,6 +14,7 @@
#include <asm/asm.h>
#include <asm/fred.h>
#include <asm/gsseg.h>
+#include <asm/nospec-branch.h>
#ifndef CONFIG_X86_CMPXCHG64
extern void cmpxchg8b_emu(void);
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index ca8eed1d496a..2bec0c89a95c 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -229,9 +229,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP);
#define _ASM_EXTABLE_UA(from, to) \
_ASM_EXTABLE_TYPE(from, to, EX_TYPE_UACCESS)
-#define _ASM_EXTABLE_CPY(from, to) \
- _ASM_EXTABLE_TYPE(from, to, EX_TYPE_COPY)
-
#define _ASM_EXTABLE_FAULT(from, to) \
_ASM_EXTABLE_TYPE(from, to, EX_TYPE_FAULT)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55a55ec04350..55b4d24356ea 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
}
#define arch_atomic_add_return arch_atomic_add_return
-static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
-{
- return arch_atomic_add_return(-i, v);
-}
-#define arch_atomic_sub_return arch_atomic_sub_return
+#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)
static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
@@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
}
#define arch_atomic_fetch_add arch_atomic_fetch_add
-static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
-{
- return xadd(&v->counter, -i);
-}
-#define arch_atomic_fetch_sub arch_atomic_fetch_sub
+#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 3486d91b8595..8db2ec4d6cda 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,6 +14,32 @@ typedef struct {
#define ATOMIC64_INIT(val) { (val) }
+/*
+ * Read an atomic64_t non-atomically.
+ *
+ * This is intended to be used in cases where a subsequent atomic operation
+ * will handle the torn value, and can be used to prime the first iteration
+ * of unconditional try_cmpxchg() loops, e.g.:
+ *
+ * s64 val = arch_atomic64_read_nonatomic(v);
+ * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i);
+ *
+ * This is NOT safe to use where the value is not always checked by a
+ * subsequent atomic operation, such as in conditional try_cmpxchg() loops
+ * that can break before the atomic operation, e.g.:
+ *
+ * s64 val = arch_atomic64_read_nonatomic(v);
+ * do {
+ * if (condition(val))
+ * break;
+ * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i);
+ */
+static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v)
+{
+ /* See comment in arch_atomic_read(). */
+ return __READ_ONCE(v->counter);
+}
+
#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
#ifndef ATOMIC64_EXPORT
#define ATOMIC64_DECL_ONE __ATOMIC64_DECL
@@ -24,7 +50,7 @@ typedef struct {
#ifdef CONFIG_X86_CMPXCHG64
#define __alternative_atomic64(f, g, out, in...) \
- asm volatile("call %P[func]" \
+ asm volatile("call %c[func]" \
: out : [func] "i" (atomic64_##g##_cx8), ## in)
#define ATOMIC64_DECL(sym) ATOMIC64_DECL_ONE(sym##_cx8)
@@ -61,12 +87,18 @@ ATOMIC64_DECL(add_unless);
#undef __ATOMIC64_DECL
#undef ATOMIC64_EXPORT
-static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
- return arch_cmpxchg64(&v->counter, o, n);
+ return arch_cmpxchg64(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+ return arch_try_cmpxchg64(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
{
s64 o;
@@ -195,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
}
static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
}
static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
}
static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);
- while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
- return old;
+ return val;
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 3165c0feedf7..ae12acae5b06 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
}
#define arch_atomic64_add_return arch_atomic64_add_return
-static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
-{
- return arch_atomic64_add_return(-i, v);
-}
-#define arch_atomic64_sub_return arch_atomic64_sub_return
+#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)
static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
@@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
-static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
-{
- return xadd(&v->counter, -i);
-}
-#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index fe1e7e3cc844..63bdc6b85219 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -79,6 +79,9 @@ do { \
#define __smp_mb__before_atomic() do { } while (0)
#define __smp_mb__after_atomic() do { } while (0)
+/* Writing to CR3 provides a full memory barrier in switch_mm(). */
+#define smp_mb__after_switch_mm() do { } while (0)
+
#include <asm-generic/barrier.h>
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index a38cc0afc90a..3e5b111e619d 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -6,11 +6,6 @@
#include <asm/pgtable_types.h>
#include <uapi/asm/boot.h>
-/* Physical address where kernel should be loaded. */
-#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
- + (CONFIG_PHYSICAL_ALIGN - 1)) \
- & ~(CONFIG_PHYSICAL_ALIGN - 1))
-
/* Minimum kernel alignment, as a power of two */
#ifdef CONFIG_X86_64
# define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
@@ -81,6 +76,7 @@
#ifndef __ASSEMBLY__
extern unsigned int output_len;
+extern const unsigned long kernel_text_size;
extern const unsigned long kernel_total_size;
unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index b5731c51f0f4..ed2797f132ce 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -3,103 +3,150 @@
#define _ASM_X86_CMPXCHG_32_H
/*
- * Note: if you use set64_bit(), __cmpxchg64(), or their variants,
+ * Note: if you use __cmpxchg64(), or their variants,
* you need to test for the feature in boot_cpu_data.
*/
-#ifdef CONFIG_X86_CMPXCHG64
-#define arch_cmpxchg64(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
- (unsigned long long)(n)))
-#define arch_cmpxchg64_local(ptr, o, n) \
- ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
- (unsigned long long)(n)))
-#define arch_try_cmpxchg64(ptr, po, n) \
- __try_cmpxchg64((ptr), (unsigned long long *)(po), \
- (unsigned long long)(n))
-#endif
+union __u64_halves {
+ u64 full;
+ struct {
+ u32 low, high;
+ };
+};
+
+#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \
+({ \
+ union __u64_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm volatile(_lock "cmpxchg8b %[ptr]" \
+ : [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ o.full; \
+})
+
+
+static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX);
+}
-static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
{
- u64 prev;
- asm volatile(LOCK_PREFIX "cmpxchg8b %1"
- : "=A" (prev),
- "+m" (*ptr)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32)),
- "0" (old)
- : "memory");
- return prev;
+ return __arch_cmpxchg64(ptr, old, new,);
}
-static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
+#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm volatile(_lock "cmpxchg8b %[ptr]" \
+ CC_SET(e) \
+ : CC_OUT(e) (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
+
+static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
{
- u64 prev;
- asm volatile("cmpxchg8b %1"
- : "=A" (prev),
- "+m" (*ptr)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32)),
- "0" (old)
- : "memory");
- return prev;
+ return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX);
}
-static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
+static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
{
- bool success;
- u64 old = *pold;
- asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]"
- CC_SET(z)
- : CC_OUT(z) (success),
- [ptr] "+m" (*ptr),
- "+A" (old)
- : "b" ((u32)new),
- "c" ((u32)(new >> 32))
- : "memory");
-
- if (unlikely(!success))
- *pold = old;
- return success;
+ return __arch_try_cmpxchg64(ptr, oldp, new,);
}
-#ifndef CONFIG_X86_CMPXCHG64
+#ifdef CONFIG_X86_CMPXCHG64
+
+#define arch_cmpxchg64 __cmpxchg64
+
+#define arch_cmpxchg64_local __cmpxchg64_local
+
+#define arch_try_cmpxchg64 __try_cmpxchg64
+
+#define arch_try_cmpxchg64_local __try_cmpxchg64_local
+
+#else
+
/*
* Building a kernel capable running on 80386 and 80486. It may be necessary
* to simulate the cmpxchg8b on the 80386 and 80486 CPU.
*/
-#define arch_cmpxchg64(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- __typeof__(*(ptr)) __old = (o); \
- __typeof__(*(ptr)) __new = (n); \
- alternative_io(LOCK_PREFIX_HERE \
- "call cmpxchg8b_emu", \
- "lock; cmpxchg8b (%%esi)" , \
- X86_FEATURE_CX8, \
- "=A" (__ret), \
- "S" ((ptr)), "0" (__old), \
- "b" ((unsigned int)__new), \
- "c" ((unsigned int)(__new>>32)) \
- : "memory"); \
- __ret; })
-
-
-#define arch_cmpxchg64_local(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- __typeof__(*(ptr)) __old = (o); \
- __typeof__(*(ptr)) __new = (n); \
- alternative_io("call cmpxchg8b_emu", \
- "cmpxchg8b (%%esi)" , \
- X86_FEATURE_CX8, \
- "=A" (__ret), \
- "S" ((ptr)), "0" (__old), \
- "b" ((unsigned int)__new), \
- "c" ((unsigned int)(__new>>32)) \
- : "memory"); \
- __ret; })
+#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \
+({ \
+ union __u64_halves o = { .full = (_old), }, \
+ n = { .full = (_new), }; \
+ \
+ asm volatile(ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \
+ : [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high), "S" (_ptr) \
+ : "memory"); \
+ \
+ o.full; \
+})
+
+static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; ");
+}
+#define arch_cmpxchg64 arch_cmpxchg64
+
+static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
+{
+ return __arch_cmpxchg64_emu(ptr, old, new, ,);
+}
+#define arch_cmpxchg64_local arch_cmpxchg64_local
+
+#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \
+({ \
+ union __u64_halves o = { .full = *(_oldp), }, \
+ n = { .full = (_new), }; \
+ bool ret; \
+ \
+ asm volatile(ALTERNATIVE(_lock_loc \
+ "call cmpxchg8b_emu", \
+ _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \
+ CC_SET(e) \
+ : CC_OUT(e) (ret), \
+ [ptr] "+m" (*(_ptr)), \
+ "+a" (o.low), "+d" (o.high) \
+ : "b" (n.low), "c" (n.high), "S" (_ptr) \
+ : "memory"); \
+ \
+ if (unlikely(!ret)) \
+ *(_oldp) = o.full; \
+ \
+ likely(ret); \
+})
+
+static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; ");
+}
+#define arch_try_cmpxchg64 arch_try_cmpxchg64
+
+static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new)
+{
+ return __arch_try_cmpxchg64_emu(ptr, oldp, new, ,);
+}
+#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local
#endif
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 44b08b53ab32..5e241306db26 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,6 +20,12 @@
arch_try_cmpxchg((ptr), (po), (n)); \
})
+#define arch_try_cmpxchg64_local(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg_local((ptr), (po), (n)); \
+})
+
union __u128_halves {
u128 full;
struct {
@@ -62,7 +68,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old,
asm volatile(_lock "cmpxchg16b %[ptr]" \
CC_SET(e) \
: CC_OUT(e) (ret), \
- [ptr] "+m" (*ptr), \
+ [ptr] "+m" (*(_ptr)), \
"+a" (o.low), "+d" (o.high) \
: "b" (n.low), "c" (n.high) \
: "memory"); \
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index fb7388bbc212..aa6c8f8ca958 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -22,8 +22,10 @@ static inline void cc_set_mask(u64 mask)
u64 cc_mkenc(u64 val);
u64 cc_mkdec(u64 val);
+void cc_random_init(void);
#else
#define cc_vendor (CC_VENDOR_NONE)
+static const u64 cc_mask = 0;
static inline u64 cc_mkenc(u64 val)
{
@@ -34,6 +36,7 @@ static inline u64 cc_mkdec(u64 val)
{
return val;
}
+static inline void cc_random_init(void) { }
#endif
#endif /* _ASM_X86_COCO_H */
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index eb8fcede9e3b..970a232009c3 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -3,6 +3,39 @@
#define _ASM_X86_CPU_DEVICE_ID
/*
+ * Can't use <linux/bitfield.h> because it generates expressions that
+ * cannot be used in structure initializers. Bitfield construction
+ * here must match the union in struct cpuinfo_86:
+ * union {
+ * struct {
+ * __u8 x86_model;
+ * __u8 x86;
+ * __u8 x86_vendor;
+ * __u8 x86_reserved;
+ * };
+ * __u32 x86_vfm;
+ * };
+ */
+#define VFM_MODEL_BIT 0
+#define VFM_FAMILY_BIT 8
+#define VFM_VENDOR_BIT 16
+#define VFM_RSVD_BIT 24
+
+#define VFM_MODEL_MASK GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT)
+#define VFM_FAMILY_MASK GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT)
+#define VFM_VENDOR_MASK GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT)
+
+#define VFM_MODEL(vfm) (((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT)
+#define VFM_FAMILY(vfm) (((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT)
+#define VFM_VENDOR(vfm) (((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT)
+
+#define VFM_MAKE(_vendor, _family, _model) ( \
+ ((_model) << VFM_MODEL_BIT) | \
+ ((_family) << VFM_FAMILY_BIT) | \
+ ((_vendor) << VFM_VENDOR_BIT) \
+)
+
+/*
* Declare drivers belonging to specific x86 CPUs
* Similar in spirit to pci_device_id and related PCI functions
*
@@ -49,6 +82,16 @@
.driver_data = (unsigned long) _data \
}
+#define X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
+ _steppings, _feature, _data) { \
+ .vendor = _vendor, \
+ .family = _family, \
+ .model = _model, \
+ .steppings = _steppings, \
+ .feature = _feature, \
+ .driver_data = (unsigned long) _data \
+}
+
/**
* X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching
* @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
@@ -164,6 +207,56 @@
X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
steppings, X86_FEATURE_ANY, data)
+/**
+ * X86_MATCH_VFM - Match encoded vendor/family/model
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ *
+ * Stepping and feature are set to wildcards
+ */
+#define X86_MATCH_VFM(vfm, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, X86_FEATURE_ANY, data)
+
+/**
+ * X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @steppings: Bitmask of steppings to match
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ *
+ * feature is set to wildcard
+ */
+#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ steppings, X86_FEATURE_ANY, data)
+
+/**
+ * X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
+ * @vfm: Encoded 8-bits each for vendor, family, model
+ * @feature: A X86_FEATURE bit
+ * @data: Driver specific data or NULL. The internal storage
+ * format is unsigned long. The supplied value, pointer
+ * etc. is cast to unsigned long internally.
+ *
+ * Steppings is set to wildcard
+ */
+#define X86_MATCH_VFM_FEATURE(vfm, feature, data) \
+ X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
+ VFM_VENDOR(vfm), \
+ VFM_FAMILY(vfm), \
+ VFM_MODEL(vfm), \
+ X86_STEPPING_ANY, feature, data)
+
/*
* Match specific microcode revisions.
*
@@ -190,6 +283,14 @@ struct x86_cpu_desc {
.x86_microcode_rev = (revision), \
}
+#define AMD_CPU_DESC(fam, model, stepping, revision) { \
+ .x86_family = (fam), \
+ .x86_vendor = X86_VENDOR_AMD, \
+ .x86_model = (model), \
+ .x86_stepping = (stepping), \
+ .x86_microcode_rev = (revision), \
+}
+
extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index a1273698fc43..0b9611da6c53 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -33,6 +33,8 @@ enum cpuid_leafs
CPUID_7_EDX,
CPUID_8000_001F_EAX,
CPUID_8000_0021_EAX,
+ CPUID_LNX_5,
+ NR_CPUID_WORDS,
};
#define X86_CAP_FMT_NUM "%d:%d"
@@ -91,8 +93,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \
REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 21))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 22))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -116,8 +119,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \
DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 21))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 22))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
@@ -125,8 +129,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define this_cpu_has(bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
- x86_this_cpu_test_bit(bit, \
- (unsigned long __percpu *)&cpu_info.x86_capability))
+ x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
/*
* This macro is for detection of features which need kernel
@@ -146,8 +149,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
extern void setup_clear_cpu_cap(unsigned int bit);
extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
-#define setup_force_cpu_cap(bit) do { \
- set_cpu_cap(&boot_cpu_data, bit); \
+#define setup_force_cpu_cap(bit) do { \
+ \
+ if (!boot_cpu_has(bit)) \
+ WARN_ON(alternatives_patched); \
+ \
+ set_cpu_cap(&boot_cpu_data, bit); \
set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
@@ -168,11 +175,10 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
*/
static __always_inline bool _static_cpu_has(u16 bit)
{
- asm goto(
- ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
+ asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
- " testb %[bitnum]," _ASM_RIP(%P[cap_byte]) "\n"
+ " testb %[bitnum], %a[cap_byte]\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"
".popsection\n"
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index f0337f7bcf16..3c7434329661 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 21 /* N 32-bit words worth of info */
+#define NCAPINTS 22 /* N 32-bit words worth of info */
#define NBUGINTS 2 /* N 32-bit bug flags */
/*
@@ -460,6 +460,18 @@
#define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */
/*
+ * Extended auxiliary flags: Linux defined - for features scattered in various
+ * CPUID levels like 0x80000022, etc and Linux defined features.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */
+#define X86_FEATURE_CLEAR_BHB_LOOP (21*32+ 1) /* "" Clear branch history at syscall entry using SW loop */
+#define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */
+#define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */
+#define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+
+/*
* BUG word(s)
*/
#define X86_BUG(x) (NCAPINTS*32 + (x))
@@ -507,4 +519,5 @@
#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */
#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
+#define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/crash_core.h b/arch/x86/include/asm/crash_reserve.h
index 76af98f4e801..7835b2cdff04 100644
--- a/arch/x86/include/asm/crash_core.h
+++ b/arch/x86/include/asm/crash_reserve.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _X86_CRASH_CORE_H
-#define _X86_CRASH_CORE_H
+#ifndef _X86_CRASH_RESERVE_H
+#define _X86_CRASH_RESERVE_H
/* 16M alignment for crash kernel regions */
#define CRASH_ALIGN SZ_16M
@@ -39,4 +39,6 @@ static inline unsigned long crash_low_size_default(void)
#endif
}
-#endif /* _X86_CRASH_CORE_H */
+#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
+
+#endif /* _X86_CRASH_RESERVE_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index da4054fbf533..c492bdc97b05 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -155,6 +155,7 @@
#define DISABLED_MASK18 (DISABLE_IBT)
#define DISABLED_MASK19 (DISABLE_SEV_SNP)
#define DISABLED_MASK20 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
+#define DISABLED_MASK21 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index e8f58ddd06d9..2e74a7f0e935 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -17,6 +17,7 @@ extern bool e820__mapped_all(u64 start, u64 end, enum e820_type type);
extern void e820__range_add (u64 start, u64 size, enum e820_type type);
extern u64 e820__range_update(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
extern u64 e820__range_remove(u64 start, u64 size, enum e820_type old_type, bool check_type);
+extern u64 e820__range_update_table(struct e820_table *t, u64 start, u64 size, enum e820_type old_type, enum e820_type new_type);
extern void e820__print_table(char *who);
extern int e820__update_table(struct e820_table *table);
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index 7acf0383be80..906b0d5541e8 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -36,7 +36,7 @@
#define EX_TYPE_DEFAULT 1
#define EX_TYPE_FAULT 2
#define EX_TYPE_UACCESS 3
-#define EX_TYPE_COPY 4
+/* unused, was: #define EX_TYPE_COPY 4 */
#define EX_TYPE_CLEAR_FS 5
#define EX_TYPE_FPU_RESTORE 6
#define EX_TYPE_BPF 7
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 66837b8c67f1..c67fa6ad098a 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -15,7 +15,7 @@ typedef struct {
unsigned int irq_spurious_count;
unsigned int icr_read_retry_count;
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
unsigned int kvm_posted_intr_ipis;
unsigned int kvm_posted_intr_wakeup_ipis;
unsigned int kvm_posted_intr_nested_ipis;
@@ -44,10 +44,16 @@ typedef struct {
unsigned int irq_hv_reenlightenment_count;
unsigned int hyperv_stimer0_count;
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ unsigned int posted_msi_notification_count;
+#endif
} ____cacheline_aligned irq_cpustat_t;
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+#ifdef CONFIG_X86_POSTED_MSI
+DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+#endif
#define __ARCH_IRQ_STAT
#define inc_irq_stat(member) this_cpu_inc(irq_stat.member)
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 2ff26f53cd62..3787d26810c1 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -182,7 +182,7 @@ enum hv_isolation_type {
#define HV_X64_MSR_HYPERCALL 0x40000001
/* MSR used to provide vcpu index */
-#define HV_REGISTER_VP_INDEX 0x40000002
+#define HV_X64_MSR_VP_INDEX 0x40000002
/* MSR used to reset the guest OS. */
#define HV_X64_MSR_RESET 0x40000003
@@ -191,10 +191,10 @@ enum hv_isolation_type {
#define HV_X64_MSR_VP_RUNTIME 0x40000010
/* MSR used to read the per-partition time reference counter */
-#define HV_REGISTER_TIME_REF_COUNT 0x40000020
+#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
/* A partition's reference time stamp counter (TSC) page */
-#define HV_REGISTER_REFERENCE_TSC 0x40000021
+#define HV_X64_MSR_REFERENCE_TSC 0x40000021
/* MSR used to retrieve the TSC frequency */
#define HV_X64_MSR_TSC_FREQUENCY 0x40000022
@@ -209,61 +209,61 @@ enum hv_isolation_type {
#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
/* Define synthetic interrupt controller model specific registers. */
-#define HV_REGISTER_SCONTROL 0x40000080
-#define HV_REGISTER_SVERSION 0x40000081
-#define HV_REGISTER_SIEFP 0x40000082
-#define HV_REGISTER_SIMP 0x40000083
-#define HV_REGISTER_EOM 0x40000084
-#define HV_REGISTER_SINT0 0x40000090
-#define HV_REGISTER_SINT1 0x40000091
-#define HV_REGISTER_SINT2 0x40000092
-#define HV_REGISTER_SINT3 0x40000093
-#define HV_REGISTER_SINT4 0x40000094
-#define HV_REGISTER_SINT5 0x40000095
-#define HV_REGISTER_SINT6 0x40000096
-#define HV_REGISTER_SINT7 0x40000097
-#define HV_REGISTER_SINT8 0x40000098
-#define HV_REGISTER_SINT9 0x40000099
-#define HV_REGISTER_SINT10 0x4000009A
-#define HV_REGISTER_SINT11 0x4000009B
-#define HV_REGISTER_SINT12 0x4000009C
-#define HV_REGISTER_SINT13 0x4000009D
-#define HV_REGISTER_SINT14 0x4000009E
-#define HV_REGISTER_SINT15 0x4000009F
+#define HV_X64_MSR_SCONTROL 0x40000080
+#define HV_X64_MSR_SVERSION 0x40000081
+#define HV_X64_MSR_SIEFP 0x40000082
+#define HV_X64_MSR_SIMP 0x40000083
+#define HV_X64_MSR_EOM 0x40000084
+#define HV_X64_MSR_SINT0 0x40000090
+#define HV_X64_MSR_SINT1 0x40000091
+#define HV_X64_MSR_SINT2 0x40000092
+#define HV_X64_MSR_SINT3 0x40000093
+#define HV_X64_MSR_SINT4 0x40000094
+#define HV_X64_MSR_SINT5 0x40000095
+#define HV_X64_MSR_SINT6 0x40000096
+#define HV_X64_MSR_SINT7 0x40000097
+#define HV_X64_MSR_SINT8 0x40000098
+#define HV_X64_MSR_SINT9 0x40000099
+#define HV_X64_MSR_SINT10 0x4000009A
+#define HV_X64_MSR_SINT11 0x4000009B
+#define HV_X64_MSR_SINT12 0x4000009C
+#define HV_X64_MSR_SINT13 0x4000009D
+#define HV_X64_MSR_SINT14 0x4000009E
+#define HV_X64_MSR_SINT15 0x4000009F
/*
* Define synthetic interrupt controller model specific registers for
* nested hypervisor.
*/
-#define HV_REGISTER_NESTED_SCONTROL 0x40001080
-#define HV_REGISTER_NESTED_SVERSION 0x40001081
-#define HV_REGISTER_NESTED_SIEFP 0x40001082
-#define HV_REGISTER_NESTED_SIMP 0x40001083
-#define HV_REGISTER_NESTED_EOM 0x40001084
-#define HV_REGISTER_NESTED_SINT0 0x40001090
+#define HV_X64_MSR_NESTED_SCONTROL 0x40001080
+#define HV_X64_MSR_NESTED_SVERSION 0x40001081
+#define HV_X64_MSR_NESTED_SIEFP 0x40001082
+#define HV_X64_MSR_NESTED_SIMP 0x40001083
+#define HV_X64_MSR_NESTED_EOM 0x40001084
+#define HV_X64_MSR_NESTED_SINT0 0x40001090
/*
* Synthetic Timer MSRs. Four timers per vcpu.
*/
-#define HV_REGISTER_STIMER0_CONFIG 0x400000B0
-#define HV_REGISTER_STIMER0_COUNT 0x400000B1
-#define HV_REGISTER_STIMER1_CONFIG 0x400000B2
-#define HV_REGISTER_STIMER1_COUNT 0x400000B3
-#define HV_REGISTER_STIMER2_CONFIG 0x400000B4
-#define HV_REGISTER_STIMER2_COUNT 0x400000B5
-#define HV_REGISTER_STIMER3_CONFIG 0x400000B6
-#define HV_REGISTER_STIMER3_COUNT 0x400000B7
+#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0
+#define HV_X64_MSR_STIMER0_COUNT 0x400000B1
+#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2
+#define HV_X64_MSR_STIMER1_COUNT 0x400000B3
+#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4
+#define HV_X64_MSR_STIMER2_COUNT 0x400000B5
+#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
+#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
/* Hyper-V guest idle MSR */
#define HV_X64_MSR_GUEST_IDLE 0x400000F0
/* Hyper-V guest crash notification MSR's */
-#define HV_REGISTER_CRASH_P0 0x40000100
-#define HV_REGISTER_CRASH_P1 0x40000101
-#define HV_REGISTER_CRASH_P2 0x40000102
-#define HV_REGISTER_CRASH_P3 0x40000103
-#define HV_REGISTER_CRASH_P4 0x40000104
-#define HV_REGISTER_CRASH_CTL 0x40000105
+#define HV_X64_MSR_CRASH_P0 0x40000100
+#define HV_X64_MSR_CRASH_P1 0x40000101
+#define HV_X64_MSR_CRASH_P2 0x40000102
+#define HV_X64_MSR_CRASH_P3 0x40000103
+#define HV_X64_MSR_CRASH_P4 0x40000104
+#define HV_X64_MSR_CRASH_CTL 0x40000105
/* TSC emulation after migration */
#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
@@ -276,31 +276,38 @@ enum hv_isolation_type {
/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */
#define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0)
-/* Register name aliases for temporary compatibility */
-#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT
-#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG
-#define HV_X64_MSR_STIMER1_COUNT HV_REGISTER_STIMER1_COUNT
-#define HV_X64_MSR_STIMER1_CONFIG HV_REGISTER_STIMER1_CONFIG
-#define HV_X64_MSR_STIMER2_COUNT HV_REGISTER_STIMER2_COUNT
-#define HV_X64_MSR_STIMER2_CONFIG HV_REGISTER_STIMER2_CONFIG
-#define HV_X64_MSR_STIMER3_COUNT HV_REGISTER_STIMER3_COUNT
-#define HV_X64_MSR_STIMER3_CONFIG HV_REGISTER_STIMER3_CONFIG
-#define HV_X64_MSR_SCONTROL HV_REGISTER_SCONTROL
-#define HV_X64_MSR_SVERSION HV_REGISTER_SVERSION
-#define HV_X64_MSR_SIMP HV_REGISTER_SIMP
-#define HV_X64_MSR_SIEFP HV_REGISTER_SIEFP
-#define HV_X64_MSR_VP_INDEX HV_REGISTER_VP_INDEX
-#define HV_X64_MSR_EOM HV_REGISTER_EOM
-#define HV_X64_MSR_SINT0 HV_REGISTER_SINT0
-#define HV_X64_MSR_SINT15 HV_REGISTER_SINT15
-#define HV_X64_MSR_CRASH_P0 HV_REGISTER_CRASH_P0
-#define HV_X64_MSR_CRASH_P1 HV_REGISTER_CRASH_P1
-#define HV_X64_MSR_CRASH_P2 HV_REGISTER_CRASH_P2
-#define HV_X64_MSR_CRASH_P3 HV_REGISTER_CRASH_P3
-#define HV_X64_MSR_CRASH_P4 HV_REGISTER_CRASH_P4
-#define HV_X64_MSR_CRASH_CTL HV_REGISTER_CRASH_CTL
-#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT
-#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC
+/*
+ * To support arch-generic code calling hv_set/get_register:
+ * - On x86, HV_MSR_ indicates an MSR accessed via rdmsrl/wrmsrl
+ * - On ARM, HV_MSR_ indicates a VP register accessed via hypercall
+ */
+#define HV_MSR_CRASH_P0 (HV_X64_MSR_CRASH_P0)
+#define HV_MSR_CRASH_P1 (HV_X64_MSR_CRASH_P1)
+#define HV_MSR_CRASH_P2 (HV_X64_MSR_CRASH_P2)
+#define HV_MSR_CRASH_P3 (HV_X64_MSR_CRASH_P3)
+#define HV_MSR_CRASH_P4 (HV_X64_MSR_CRASH_P4)
+#define HV_MSR_CRASH_CTL (HV_X64_MSR_CRASH_CTL)
+
+#define HV_MSR_VP_INDEX (HV_X64_MSR_VP_INDEX)
+#define HV_MSR_TIME_REF_COUNT (HV_X64_MSR_TIME_REF_COUNT)
+#define HV_MSR_REFERENCE_TSC (HV_X64_MSR_REFERENCE_TSC)
+
+#define HV_MSR_SINT0 (HV_X64_MSR_SINT0)
+#define HV_MSR_SVERSION (HV_X64_MSR_SVERSION)
+#define HV_MSR_SCONTROL (HV_X64_MSR_SCONTROL)
+#define HV_MSR_SIEFP (HV_X64_MSR_SIEFP)
+#define HV_MSR_SIMP (HV_X64_MSR_SIMP)
+#define HV_MSR_EOM (HV_X64_MSR_EOM)
+
+#define HV_MSR_NESTED_SCONTROL (HV_X64_MSR_NESTED_SCONTROL)
+#define HV_MSR_NESTED_SVERSION (HV_X64_MSR_NESTED_SVERSION)
+#define HV_MSR_NESTED_SIEFP (HV_X64_MSR_NESTED_SIEFP)
+#define HV_MSR_NESTED_SIMP (HV_X64_MSR_NESTED_SIMP)
+#define HV_MSR_NESTED_EOM (HV_X64_MSR_NESTED_EOM)
+#define HV_MSR_NESTED_SINT0 (HV_X64_MSR_NESTED_SINT0)
+
+#define HV_MSR_STIMER0_CONFIG (HV_X64_MSR_STIMER0_CONFIG)
+#define HV_MSR_STIMER0_COUNT (HV_X64_MSR_STIMER0_COUNT)
/*
* Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index 4212c00c9708..9d69f3f8dbab 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -56,17 +56,6 @@ struct stat64 {
unsigned long long st_ino;
} __attribute__((packed));
-#define IA32_STACK_TOP IA32_PAGE_OFFSET
-
-#ifdef __KERNEL__
-struct linux_binprm;
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
- unsigned long stack_top, int exec_stack);
-struct mm_struct;
-extern void ia32_pick_mmap_layout(struct mm_struct *mm);
-
-#endif
-
extern bool __ia32_enabled;
static __always_inline bool ia32_enabled(void)
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
deleted file mode 100644
index aa065c94ccf5..000000000000
--- a/arch/x86/include/asm/ia32_unistd.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_IA32_UNISTD_H
-#define _ASM_X86_IA32_UNISTD_H
-
-/*
- * This file contains the system call numbers of the ia32 compat ABI,
- * this is for the kernel only.
- */
-#define __SYSCALL_ia32_NR(x) (x)
-#include <asm/unistd_32_ia32.h>
-
-#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 47d4c04d103d..d4f24499b256 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -741,7 +741,7 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work);
# endif
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi);
DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi);
DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi);
@@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested
# define fred_sysvec_kvm_posted_intr_nested_ipi NULL
#endif
+# ifdef CONFIG_X86_POSTED_MSI
+DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification);
+#else
+# define fred_sysvec_posted_msi_notification NULL
+# endif
+
#if IS_ENABLED(CONFIG_HYPERV)
DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index b65e9c46b922..f81a851c46dc 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -40,136 +40,221 @@
* their own names :-(
*/
+#define IFM(_fam, _model) VFM_MAKE(X86_VENDOR_INTEL, _fam, _model)
+
/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
#define INTEL_FAM6_ANY X86_MODEL_ANY
+/* Wildcard match for FAM6 so X86_MATCH_VFM(ANY) works */
+#define INTEL_ANY IFM(X86_FAMILY_ANY, X86_MODEL_ANY)
#define INTEL_FAM6_CORE_YONAH 0x0E
+#define INTEL_CORE_YONAH IFM(6, 0x0E)
#define INTEL_FAM6_CORE2_MEROM 0x0F
+#define INTEL_CORE2_MEROM IFM(6, 0x0F)
#define INTEL_FAM6_CORE2_MEROM_L 0x16
+#define INTEL_CORE2_MEROM_L IFM(6, 0x16)
#define INTEL_FAM6_CORE2_PENRYN 0x17
+#define INTEL_CORE2_PENRYN IFM(6, 0x17)
#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D
+#define INTEL_CORE2_DUNNINGTON IFM(6, 0x1D)
#define INTEL_FAM6_NEHALEM 0x1E
+#define INTEL_NEHALEM IFM(6, 0x1E)
#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */
+#define INTEL_NEHALEM_G IFM(6, 0x1F) /* Auburndale / Havendale */
#define INTEL_FAM6_NEHALEM_EP 0x1A
+#define INTEL_NEHALEM_EP IFM(6, 0x1A)
#define INTEL_FAM6_NEHALEM_EX 0x2E
+#define INTEL_NEHALEM_EX IFM(6, 0x2E)
#define INTEL_FAM6_WESTMERE 0x25
+#define INTEL_WESTMERE IFM(6, 0x25)
#define INTEL_FAM6_WESTMERE_EP 0x2C
+#define INTEL_WESTMERE_EP IFM(6, 0x2C)
#define INTEL_FAM6_WESTMERE_EX 0x2F
+#define INTEL_WESTMERE_EX IFM(6, 0x2F)
#define INTEL_FAM6_SANDYBRIDGE 0x2A
+#define INTEL_SANDYBRIDGE IFM(6, 0x2A)
#define INTEL_FAM6_SANDYBRIDGE_X 0x2D
+#define INTEL_SANDYBRIDGE_X IFM(6, 0x2D)
#define INTEL_FAM6_IVYBRIDGE 0x3A
+#define INTEL_IVYBRIDGE IFM(6, 0x3A)
#define INTEL_FAM6_IVYBRIDGE_X 0x3E
+#define INTEL_IVYBRIDGE_X IFM(6, 0x3E)
#define INTEL_FAM6_HASWELL 0x3C
+#define INTEL_HASWELL IFM(6, 0x3C)
#define INTEL_FAM6_HASWELL_X 0x3F
+#define INTEL_HASWELL_X IFM(6, 0x3F)
#define INTEL_FAM6_HASWELL_L 0x45
+#define INTEL_HASWELL_L IFM(6, 0x45)
#define INTEL_FAM6_HASWELL_G 0x46
+#define INTEL_HASWELL_G IFM(6, 0x46)
#define INTEL_FAM6_BROADWELL 0x3D
+#define INTEL_BROADWELL IFM(6, 0x3D)
#define INTEL_FAM6_BROADWELL_G 0x47
+#define INTEL_BROADWELL_G IFM(6, 0x47)
#define INTEL_FAM6_BROADWELL_X 0x4F
+#define INTEL_BROADWELL_X IFM(6, 0x4F)
#define INTEL_FAM6_BROADWELL_D 0x56
+#define INTEL_BROADWELL_D IFM(6, 0x56)
#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */
+#define INTEL_SKYLAKE_L IFM(6, 0x4E) /* Sky Lake */
#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */
+#define INTEL_SKYLAKE IFM(6, 0x5E) /* Sky Lake */
#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */
+#define INTEL_SKYLAKE_X IFM(6, 0x55) /* Sky Lake */
/* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */
/* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */
#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */
+#define INTEL_KABYLAKE_L IFM(6, 0x8E) /* Sky Lake */
/* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */
/* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */
/* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */
#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */
+#define INTEL_KABYLAKE IFM(6, 0x9E) /* Sky Lake */
/* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */
#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */
+#define INTEL_COMETLAKE IFM(6, 0xA5) /* Sky Lake */
#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */
+#define INTEL_COMETLAKE_L IFM(6, 0xA6) /* Sky Lake */
#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */
+#define INTEL_CANNONLAKE_L IFM(6, 0x66) /* Palm Cove */
#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */
+#define INTEL_ICELAKE_X IFM(6, 0x6A) /* Sunny Cove */
#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */
+#define INTEL_ICELAKE_D IFM(6, 0x6C) /* Sunny Cove */
#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */
+#define INTEL_ICELAKE IFM(6, 0x7D) /* Sunny Cove */
#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */
+#define INTEL_ICELAKE_L IFM(6, 0x7E) /* Sunny Cove */
#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */
+#define INTEL_ICELAKE_NNPI IFM(6, 0x9D) /* Sunny Cove */
#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */
+#define INTEL_ROCKETLAKE IFM(6, 0xA7) /* Cypress Cove */
#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */
+#define INTEL_TIGERLAKE_L IFM(6, 0x8C) /* Willow Cove */
#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */
+#define INTEL_TIGERLAKE IFM(6, 0x8D) /* Willow Cove */
#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */
+#define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */
#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF
+#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF)
#define INTEL_FAM6_GRANITERAPIDS_X 0xAD
+#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD)
#define INTEL_FAM6_GRANITERAPIDS_D 0xAE
+#define INTEL_GRANITERAPIDS_D IFM(6, 0xAE)
/* "Hybrid" Processors (P-Core/E-Core) */
#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */
+#define INTEL_LAKEFIELD IFM(6, 0x8A) /* Sunny Cove / Tremont */
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
+#define INTEL_ALDERLAKE IFM(6, 0x97) /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
+#define INTEL_ALDERLAKE_L IFM(6, 0x9A) /* Golden Cove / Gracemont */
#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */
+#define INTEL_RAPTORLAKE IFM(6, 0xB7) /* Raptor Cove / Enhanced Gracemont */
#define INTEL_FAM6_RAPTORLAKE_P 0xBA
+#define INTEL_RAPTORLAKE_P IFM(6, 0xBA)
#define INTEL_FAM6_RAPTORLAKE_S 0xBF
+#define INTEL_RAPTORLAKE_S IFM(6, 0xBF)
#define INTEL_FAM6_METEORLAKE 0xAC
+#define INTEL_METEORLAKE IFM(6, 0xAC)
#define INTEL_FAM6_METEORLAKE_L 0xAA
+#define INTEL_METEORLAKE_L IFM(6, 0xAA)
#define INTEL_FAM6_ARROWLAKE_H 0xC5
+#define INTEL_ARROWLAKE_H IFM(6, 0xC5)
#define INTEL_FAM6_ARROWLAKE 0xC6
+#define INTEL_ARROWLAKE IFM(6, 0xC6)
+#define INTEL_FAM6_ARROWLAKE_U 0xB5
+#define INTEL_ARROWLAKE_U IFM(6, 0xB5)
#define INTEL_FAM6_LUNARLAKE_M 0xBD
+#define INTEL_LUNARLAKE_M IFM(6, 0xBD)
/* "Small Core" Processors (Atom/E-Core) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
+#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */
#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
+#define INTEL_ATOM_BONNELL_MID IFM(6, 0x26) /* Silverthorne, Lincroft */
#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
+#define INTEL_ATOM_SALTWELL IFM(6, 0x36) /* Cedarview */
#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
+#define INTEL_ATOM_SALTWELL_MID IFM(6, 0x27) /* Penwell */
#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
+#define INTEL_ATOM_SALTWELL_TABLET IFM(6, 0x35) /* Cloverview */
#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
+#define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */
#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */
+#define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */
#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */
+#define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */
#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
+#define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */
#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
+#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */
#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */
+#define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
+#define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */
#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */
+#define INTEL_ATOM_GOLDMONT_D IFM(6, 0x5F) /* Denverton */
/* Note: the micro-architecture is "Goldmont Plus" */
#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+#define INTEL_ATOM_GOLDMONT_PLUS IFM(6, 0x7A) /* Gemini Lake */
#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */
+#define INTEL_ATOM_TREMONT_D IFM(6, 0x86) /* Jacobsville */
#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */
+#define INTEL_ATOM_TREMONT IFM(6, 0x96) /* Elkhart Lake */
#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */
+#define INTEL_ATOM_TREMONT_L IFM(6, 0x9C) /* Jasper Lake */
#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */
+#define INTEL_ATOM_GRACEMONT IFM(6, 0xBE) /* Alderlake N */
#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */
+#define INTEL_ATOM_CRESTMONT_X IFM(6, 0xAF) /* Sierra Forest */
#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */
+#define INTEL_ATOM_CRESTMONT IFM(6, 0xB6) /* Grand Ridge */
#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */
+#define INTEL_ATOM_DARKMONT_X IFM(6, 0xDD) /* Clearwater Forest */
/* Xeon Phi */
#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
+#define INTEL_XEON_PHI_KNL IFM(6, 0x57) /* Knights Landing */
#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */
+#define INTEL_XEON_PHI_KNM IFM(6, 0x85) /* Knights Mill */
/* Family 5 */
#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */
+#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */
#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 836c170d3087..194dfff84cb1 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -29,7 +29,7 @@ struct irq_desc;
extern void fixup_irqs(void);
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
#endif
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 7a2ed154a5e1..5036f13ab69f 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -50,6 +50,13 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
return x86_vector_domain;
}
+extern bool enable_posted_msi;
+
+static inline bool posted_msi_supported(void)
+{
+ return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
#else /* CONFIG_IRQ_REMAP */
static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 798183867d78..b71ad173f877 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -100,7 +100,7 @@
}
#define ASM_CALL_ARG0 \
- "call %P[__func] \n" \
+ "call %c[__func] \n" \
ASM_REACHABLE
#define ASM_CALL_ARG1 \
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3a19904c2db6..13aea8fc3d45 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -84,11 +84,9 @@
#define HYPERVISOR_CALLBACK_VECTOR 0xf3
/* Vector for KVM to deliver posted interrupt IPI */
-#ifdef CONFIG_HAVE_KVM
#define POSTED_INTR_VECTOR 0xf2
#define POSTED_INTR_WAKEUP_VECTOR 0xf1
#define POSTED_INTR_NESTED_VECTOR 0xf0
-#endif
#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef
@@ -99,10 +97,16 @@
#define LOCAL_TIMER_VECTOR 0xec
+/*
+ * Posted interrupt notification vector for all device MSIs delivered to
+ * the host kernel.
+ */
+#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb
+
#define NR_VECTORS 256
#ifdef CONFIG_X86_LOCAL_APIC
-#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR
+#define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR
#else
#define FIRST_SYSTEM_VECTOR NR_VECTORS
#endif
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index ab24ce207988..110d7f29ca9a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -103,7 +103,6 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index 058bc636356a..f852b13aeefe 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -12,11 +12,9 @@ BUILD_BUG_ON(1)
* a NULL definition, for example if "static_call_cond()" will be used
* at the call sites.
*/
-KVM_X86_PMU_OP(hw_event_available)
-KVM_X86_PMU_OP(pmc_idx_to_pmc)
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
KVM_X86_PMU_OP(msr_idx_to_pmc)
-KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
+KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early)
KVM_X86_PMU_OP(is_valid_msr)
KVM_X86_PMU_OP(get_msr)
KVM_X86_PMU_OP(set_msr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 18cbde14cf81..6efd1497b026 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -536,6 +536,7 @@ struct kvm_pmc {
#define KVM_PMC_MAX_FIXED 3
#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC 6
+
struct kvm_pmu {
u8 version;
unsigned nr_arch_gp_counters;
@@ -854,6 +855,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
struct kvm_hypervisor_cpuid kvm_cpuid;
+ bool is_amd_compatible;
/*
* FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly
@@ -1468,6 +1470,15 @@ struct kvm_arch {
*/
bool shadow_root_allocated;
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * If set, the VM has (or had) an external write tracking user, and
+ * thus all write tracking metadata has been allocated, even if KVM
+ * itself isn't using write tracking.
+ */
+ bool external_write_tracking_enabled;
+#endif
+
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
spinlock_t hv_root_tdp_lock;
@@ -1665,7 +1676,8 @@ struct kvm_x86_ops {
void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
- enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1733,8 +1745,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
-
void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
/*
@@ -1882,8 +1892,16 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
}
#endif /* CONFIG_HYPERV */
+enum kvm_intr_type {
+ /* Values are arbitrary, but must be non-zero. */
+ KVM_HANDLING_IRQ = 1,
+ KVM_HANDLING_NMI,
+};
+
+/* Enable perf NMI and timer modes to work, and minimise false positives. */
#define kvm_arch_pmi_in_guest(vcpu) \
- ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
+ ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \
+ (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI)))
void __init kvm_mmu_x86_module_init(void);
int kvm_mmu_vendor_module_init(void);
@@ -2048,7 +2066,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
@@ -2241,7 +2259,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
u32 size);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index de3118305838..dfd2e9699bd7 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -13,6 +13,7 @@
#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */
#define MCG_EXT_P BIT_ULL(9) /* Extended registers available */
#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */
+#define MCG_SEAM_NR BIT_ULL(12) /* MCG_STATUS_SEAM_NR supported */
#define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */
#define MCG_EXT_CNT_SHIFT 16
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
@@ -25,6 +26,7 @@
#define MCG_STATUS_EIPV BIT_ULL(1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP BIT_ULL(2) /* machine check in progress */
#define MCG_STATUS_LMCES BIT_ULL(3) /* LMCE signaled */
+#define MCG_STATUS_SEAM_NR BIT_ULL(12) /* Machine check inside SEAM non-root mode */
/* MCG_EXT_CTL register defines */
#define MCG_EXT_CTL_LMCE_EN BIT_ULL(0) /* Enable LMCE */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 0da5c227f490..ce4677b8b735 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -75,7 +75,7 @@ typedef struct {
.lock = __MUTEX_INITIALIZER(mm.context.lock), \
}
-void leave_mm(int cpu);
+void leave_mm(void);
#define leave_mm leave_mm
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index ce4ce8720d55..390c4d13956d 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -293,24 +293,24 @@ static inline void hv_ivm_msr_write(u64 msr, u64 value) {}
static inline void hv_ivm_msr_read(u64 msr, u64 *value) {}
#endif
-static inline bool hv_is_synic_reg(unsigned int reg)
+static inline bool hv_is_synic_msr(unsigned int reg)
{
- return (reg >= HV_REGISTER_SCONTROL) &&
- (reg <= HV_REGISTER_SINT15);
+ return (reg >= HV_X64_MSR_SCONTROL) &&
+ (reg <= HV_X64_MSR_SINT15);
}
-static inline bool hv_is_sint_reg(unsigned int reg)
+static inline bool hv_is_sint_msr(unsigned int reg)
{
- return (reg >= HV_REGISTER_SINT0) &&
- (reg <= HV_REGISTER_SINT15);
+ return (reg >= HV_X64_MSR_SINT0) &&
+ (reg <= HV_X64_MSR_SINT15);
}
-u64 hv_get_register(unsigned int reg);
-void hv_set_register(unsigned int reg, u64 value);
-u64 hv_get_non_nested_register(unsigned int reg);
-void hv_set_non_nested_register(unsigned int reg, u64 value);
+u64 hv_get_msr(unsigned int reg);
+void hv_set_msr(unsigned int reg, u64 value);
+u64 hv_get_non_nested_msr(unsigned int reg);
+void hv_set_non_nested_msr(unsigned int reg, u64 value);
-static __always_inline u64 hv_raw_get_register(unsigned int reg)
+static __always_inline u64 hv_raw_get_msr(unsigned int reg)
{
return __rdmsr(reg);
}
@@ -331,10 +331,10 @@ static inline int hyperv_flush_guest_mapping_range(u64 as,
{
return -1;
}
-static inline void hv_set_register(unsigned int reg, u64 value) { }
-static inline u64 hv_get_register(unsigned int reg) { return 0; }
-static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { }
-static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; }
+static inline void hv_set_msr(unsigned int reg, u64 value) { }
+static inline u64 hv_get_msr(unsigned int reg) { return 0; }
+static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) { }
+static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
#endif /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 05956bd8bacf..e022e6eb766c 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -61,10 +61,13 @@
#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */
#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT)
+#define SPEC_CTRL_BHI_DIS_S_SHIFT 10 /* Disable Branch History Injection behavior */
+#define SPEC_CTRL_BHI_DIS_S BIT(SPEC_CTRL_BHI_DIS_S_SHIFT)
/* A mask for bits which the kernel toggles when controlling mitigations */
#define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \
- | SPEC_CTRL_RRSBA_DIS_S)
+ | SPEC_CTRL_RRSBA_DIS_S \
+ | SPEC_CTRL_BHI_DIS_S)
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
@@ -163,6 +166,14 @@
* are restricted to targets in
* kernel.
*/
+#define ARCH_CAP_BHI_NO BIT(20) /*
+ * CPU is not affected by Branch
+ * History Injection.
+ */
+#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
+ * IA32_XAPIC_DISABLE_STATUS MSR
+ * supported
+ */
#define ARCH_CAP_PBRSB_NO BIT(24) /*
* Not susceptible to Post-Barrier
* Return Stack Buffer Predictions.
@@ -185,11 +196,6 @@
* File.
*/
-#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
- * IA32_XAPIC_DISABLE_STATUS MSR
- * supported
- */
-
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH BIT(0) /*
* Writeback and invalidate the
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index fc3a8a3c7ffe..ff5f1ecc7d1e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -262,11 +262,20 @@
.Lskip_rsb_\@:
.endm
+/*
+ * The CALL to srso_alias_untrain_ret() must be patched in directly at
+ * the spot where untraining must be done, ie., srso_alias_untrain_ret()
+ * must be the target of a CALL instruction instead of indirectly
+ * jumping to a wrapper which then calls it. Therefore, this macro is
+ * called outside of __UNTRAIN_RET below, for the time being, before the
+ * kernel can support nested alternatives with arbitrary nesting.
+ */
+.macro CALL_UNTRAIN_RET
#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
-#define CALL_UNTRAIN_RET "call entry_untrain_ret"
-#else
-#define CALL_UNTRAIN_RET ""
+ ALTERNATIVE_2 "", "call entry_untrain_ret", X86_FEATURE_UNRET, \
+ "call srso_alias_untrain_ret", X86_FEATURE_SRSO_ALIAS
#endif
+.endm
/*
* Mitigate RETBleed for AMD/Hygon Zen uarch. Requires KERNEL CR3 because the
@@ -282,8 +291,8 @@
.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY)
VALIDATE_UNRET_END
- ALTERNATIVE_3 "", \
- CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ CALL_UNTRAIN_RET
+ ALTERNATIVE_2 "", \
"call entry_ibpb", \ibpb_feature, \
__stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH
#endif
@@ -317,6 +326,19 @@
ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF
.endm
+#ifdef CONFIG_X86_64
+.macro CLEAR_BRANCH_HISTORY
+ ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
+.endm
+
+.macro CLEAR_BRANCH_HISTORY_VMEXIT
+ ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT
+.endm
+#else
+#define CLEAR_BRANCH_HISTORY
+#define CLEAR_BRANCH_HISTORY_VMEXIT
+#endif
+
#else /* __ASSEMBLY__ */
#define ANNOTATE_RETPOLINE_SAFE \
@@ -342,6 +364,8 @@ extern void retbleed_return_thunk(void);
static inline void retbleed_return_thunk(void) {}
#endif
+extern void srso_alias_untrain_ret(void);
+
#ifdef CONFIG_MITIGATION_SRSO
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
@@ -357,6 +381,10 @@ extern void srso_alias_return_thunk(void);
extern void entry_untrain_ret(void);
extern void entry_ibpb(void);
+#ifdef CONFIG_X86_64
+extern void clear_bhb_loop(void);
+#endif
+
extern void (*x86_return_thunk)(void);
extern void __warn_thunk(void);
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 9da9c8a2f1df..52f1b4ff0cc1 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -31,10 +31,12 @@
#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
-#define __PHYSICAL_START ALIGN(CONFIG_PHYSICAL_START, \
- CONFIG_PHYSICAL_ALIGN)
+/* Physical address where kernel should be loaded. */
+#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ + (CONFIG_PHYSICAL_ALIGN - 1)) \
+ & ~(CONFIG_PHYSICAL_ALIGN - 1))
-#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
+#define __START_KERNEL (__START_KERNEL_map + LOAD_PHYSICAL_ADDR)
#ifdef CONFIG_X86_64
#include <asm/page_64_types.h>
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958ebaf626..3bedee1801e2 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -59,36 +59,24 @@
#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":"
#define __my_cpu_offset this_cpu_read(this_cpu_off)
-#ifdef CONFIG_USE_X86_SEG_SUPPORT
-/*
- * Efficient implementation for cases in which the compiler supports
- * named address spaces. Allows the compiler to perform additional
- * optimizations that can save more instructions.
- */
-#define arch_raw_cpu_ptr(ptr) \
-({ \
- unsigned long tcp_ptr__; \
- tcp_ptr__ = __raw_cpu_read(, this_cpu_off); \
- \
- tcp_ptr__ += (unsigned long)(ptr); \
- (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
-})
-#else /* CONFIG_USE_X86_SEG_SUPPORT */
/*
* Compared to the generic __my_cpu_offset version, the following
* saves one instruction and avoids clobbering a temp register.
+ *
+ * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit
+ * kernel, because games are played with CONFIG_X86_64 there and
+ * sizeof(this_cpu_off) becames 4.
*/
-#define arch_raw_cpu_ptr(ptr) \
+#ifndef BUILD_VDSO32_64
+#define arch_raw_cpu_ptr(_ptr) \
({ \
- unsigned long tcp_ptr__; \
- asm ("mov " __percpu_arg(1) ", %0" \
- : "=r" (tcp_ptr__) \
- : "m" (__my_cpu_var(this_cpu_off))); \
- \
- tcp_ptr__ += (unsigned long)(ptr); \
- (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+ unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off); \
+ tcp_ptr__ += (__force unsigned long)(_ptr); \
+ (typeof(*(_ptr)) __kernel __force *)tcp_ptr__; \
})
-#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+#else
+#define arch_raw_cpu_ptr(_ptr) ({ BUILD_BUG(); (typeof(_ptr))0; })
+#endif
#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel
@@ -102,8 +90,8 @@
#endif /* CONFIG_SMP */
#define __my_cpu_type(var) typeof(var) __percpu_seg_override
-#define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
-#define __my_cpu_var(var) (*__my_cpu_ptr(&var))
+#define __my_cpu_ptr(ptr) (__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr)
+#define __my_cpu_var(var) (*__my_cpu_ptr(&(var)))
#define __percpu_arg(x) __percpu_prefix "%" #x
#define __force_percpu_arg(x) __force_percpu_prefix "%" #x
@@ -230,25 +218,26 @@ do { \
})
/*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix. The processor cannot prefetch
- * cachelines if xchg is used.
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
*/
-#define percpu_xchg_op(size, qual, _var, _nval) \
+#define raw_percpu_xchg_op(_var, _nval) \
({ \
- __pcpu_type_##size pxo_old__; \
- __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \
- asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \
- "%[oval]") \
- "\n1:\t" \
- __pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
- "\n\tjnz 1b" \
- : [oval] "=&a" (pxo_old__), \
- [var] "+m" (__my_cpu_var(_var)) \
- : [nval] __pcpu_reg_##size(, pxo_new__) \
- : "memory"); \
- (typeof(_var))(unsigned long) pxo_old__; \
+ typeof(_var) pxo_old__ = raw_cpu_read(_var); \
+ raw_cpu_write(_var, _nval); \
+ pxo_old__; \
+})
+
+/*
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
+ */
+#define this_percpu_xchg_op(_var, _nval) \
+({ \
+ typeof(_var) pxo_old__ = this_cpu_read(_var); \
+ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \
+ pxo_old__; \
})
/*
@@ -428,10 +417,6 @@ do { \
* actually per-thread variables implemented as per-CPU variables and
* thus stable for the duration of the respective task.
*/
-#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
-#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
-#define this_cpu_read_stable_4(pcp) percpu_stable_op(4, "mov", pcp)
-#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp)
#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp)
#ifdef CONFIG_USE_X86_SEG_SUPPORT
@@ -500,6 +485,10 @@ do { \
#define this_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
+#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
+#define this_cpu_read_stable_4(pcp) percpu_stable_op(4, "mov", pcp)
+
#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val)
#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val)
#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val)
@@ -509,18 +498,6 @@ do { \
#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val)
#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val)
#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval) \
-({ \
- typeof(var) pxo_ret__ = raw_cpu_read(var); \
- raw_cpu_write(var, (nval)); \
- pxo_ret__; \
-})
-
#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)
@@ -534,9 +511,9 @@ do { \
#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val)
#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val)
#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval)
#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
@@ -563,6 +540,8 @@ do { \
* 32 bit must fall back to generic operations.
*/
#ifdef CONFIG_X86_64
+#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp)
+
#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val)
#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val)
#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val)
@@ -575,41 +554,41 @@ do { \
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
-#endif
-
-static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
- const unsigned long __percpu *addr)
-{
- unsigned long __percpu *a =
- (unsigned long __percpu *)addr + nr / BITS_PER_LONG;
-#ifdef CONFIG_X86_64
- return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
+#define raw_cpu_read_long(pcp) raw_cpu_read_8(pcp)
#else
- return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0;
-#endif
-}
+/* There is no generic 64 bit read stable operation for 32 bit targets. */
+#define this_cpu_read_stable_8(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
-static inline bool x86_this_cpu_variable_test_bit(int nr,
- const unsigned long __percpu *addr)
-{
- bool oldbit;
+#define raw_cpu_read_long(pcp) raw_cpu_read_4(pcp)
+#endif
- asm volatile("btl "__percpu_arg(2)",%1"
- CC_SET(c)
- : CC_OUT(c) (oldbit)
- : "m" (*__my_cpu_ptr((unsigned long __percpu *)(addr))), "Ir" (nr));
+#define x86_this_cpu_constant_test_bit(_nr, _var) \
+({ \
+ unsigned long __percpu *addr__ = \
+ (unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \
+ !!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__)); \
+})
- return oldbit;
-}
+#define x86_this_cpu_variable_test_bit(_nr, _var) \
+({ \
+ bool oldbit; \
+ \
+ asm volatile("btl %[nr], " __percpu_arg([var]) \
+ CC_SET(c) \
+ : CC_OUT(c) (oldbit) \
+ : [var] "m" (__my_cpu_var(_var)), \
+ [nr] "rI" (_nr)); \
+ oldbit; \
+})
-#define x86_this_cpu_test_bit(nr, addr) \
- (__builtin_constant_p((nr)) \
- ? x86_this_cpu_constant_test_bit((nr), (addr)) \
- : x86_this_cpu_variable_test_bit((nr), (addr)))
+#define x86_this_cpu_test_bit(_nr, _var) \
+ (__builtin_constant_p(_nr) \
+ ? x86_this_cpu_constant_test_bit(_nr, _var) \
+ : x86_this_cpu_variable_test_bit(_nr, _var))
#include <asm-generic/percpu.h>
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 3736b8a46c04..7f1e17250546 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -555,6 +555,7 @@ struct x86_pmu_lbr {
unsigned int from;
unsigned int to;
unsigned int info;
+ bool has_callstack;
};
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index df0f7d4a96f3..315535ffb258 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -31,7 +31,8 @@ struct seq_file;
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
bool user);
-void ptdump_walk_pgd_level_checkwx(void);
+bool ptdump_walk_pgd_level_checkwx(void);
+#define ptdump_check_wx ptdump_walk_pgd_level_checkwx
void ptdump_walk_user_pgd_level_checkwx(void);
/*
@@ -41,10 +42,8 @@ void ptdump_walk_user_pgd_level_checkwx(void);
#define pgprot_decrypted(prot) __pgprot(cc_mkdec(pgprot_val(prot)))
#ifdef CONFIG_DEBUG_WX
-#define debug_checkwx() ptdump_walk_pgd_level_checkwx()
#define debug_checkwx_user() ptdump_walk_user_pgd_level_checkwx()
#else
-#define debug_checkwx() do { } while (0)
#define debug_checkwx_user() do { } while (0)
#endif
@@ -252,8 +251,8 @@ static inline unsigned long pgd_pfn(pgd_t pgd)
return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
}
-#define p4d_leaf p4d_large
-static inline int p4d_large(p4d_t p4d)
+#define p4d_leaf p4d_leaf
+static inline bool p4d_leaf(p4d_t p4d)
{
/* No 512 GiB pages yet */
return 0;
@@ -261,14 +260,14 @@ static inline int p4d_large(p4d_t p4d)
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
-#define pmd_leaf pmd_large
-static inline int pmd_large(pmd_t pte)
+#define pmd_leaf pmd_leaf
+static inline bool pmd_leaf(pmd_t pte)
{
return pmd_flags(pte) & _PAGE_PSE;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */
+/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_leaf */
static inline int pmd_trans_huge(pmd_t pmd)
{
return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
@@ -956,13 +955,13 @@ static inline int pte_same(pte_t a, pte_t b)
return a.pte == b.pte;
}
-static inline pte_t pte_next_pfn(pte_t pte)
+static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
if (__pte_needs_invert(pte_val(pte)))
- return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT));
- return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
+ return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT));
+ return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT));
}
-#define pte_next_pfn pte_next_pfn
+#define pte_advance_pfn pte_advance_pfn
static inline int pte_present(pte_t a)
{
@@ -1086,8 +1085,8 @@ static inline pmd_t *pud_pgtable(pud_t pud)
*/
#define pud_page(pud) pfn_to_page(pud_pfn(pud))
-#define pud_leaf pud_large
-static inline int pud_large(pud_t pud)
+#define pud_leaf pud_leaf
+static inline bool pud_leaf(pud_t pud)
{
return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
(_PAGE_PSE | _PAGE_PRESENT);
@@ -1097,12 +1096,6 @@ static inline int pud_bad(pud_t pud)
{
return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
}
-#else
-#define pud_leaf pud_large
-static inline int pud_large(pud_t pud)
-{
- return 0;
-}
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#if CONFIG_PGTABLE_LEVELS > 3
@@ -1419,8 +1412,8 @@ static inline bool pgdp_maps_userspace(void *__ptr)
return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START);
}
-#define pgd_leaf pgd_large
-static inline int pgd_large(pgd_t pgd) { return 0; }
+#define pgd_leaf pgd_leaf
+static inline bool pgd_leaf(pgd_t pgd) { return false; }
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0b748ee16b3d..b78644962626 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -148,7 +148,7 @@
#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | \
_PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \
- _PAGE_DEVMAP | _PAGE_ENC | _PAGE_UFFD_WP)
+ _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP)
#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)
@@ -173,6 +173,7 @@ enum page_cache_mode {
};
#endif
+#define _PAGE_CC (_AT(pteval_t, cc_mask))
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
@@ -566,6 +567,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
new file mode 100644
index 000000000000..de788b400fba
--- /dev/null
+++ b/arch/x86/include/asm/posted_intr.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_POSTED_INTR_H
+#define _X86_POSTED_INTR_H
+#include <asm/irq_vectors.h>
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+#define PID_TABLE_ENTRY_VALID 1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ union {
+ u32 pir[8]; /* Posted interrupt requested */
+ u64 pir64[4];
+ };
+ union {
+ struct {
+ u16 notifications; /* Suppress and outstanding bits */
+ u8 nv;
+ u8 rsvd_2;
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
+}
+
+/* Non-atomic helpers */
+static inline void __pi_set_sn(struct pi_desc *pi_desc)
+{
+ pi_desc->notifications |= BIT(POSTED_INTR_SN);
+}
+
+static inline void __pi_clear_sn(struct pi_desc *pi_desc)
+{
+ pi_desc->notifications &= ~BIT(POSTED_INTR_SN);
+}
+
+#ifdef CONFIG_X86_POSTED_MSI
+/*
+ * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's
+ * own interrupts. Here we do not distinguish them since those vector bits in
+ * PIR will always be zero.
+ */
+static inline bool pi_pending_this_cpu(unsigned int vector)
+{
+ struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+ if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
+ return false;
+
+ return test_bit(vector, (unsigned long *)pid->pir);
+}
+
+extern void intel_posted_msi_init(void);
+#else
+static inline bool pi_pending_this_cpu(unsigned int vector) { return false; }
+
+static inline void intel_posted_msi_init(void) {};
+#endif /* X86_POSTED_MSI */
+
+#endif /* _X86_POSTED_INTR_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 811548f131f4..cb4f6c513c48 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -108,9 +108,23 @@ struct cpuinfo_topology {
};
struct cpuinfo_x86 {
- __u8 x86; /* CPU family */
- __u8 x86_vendor; /* CPU vendor */
- __u8 x86_model;
+ union {
+ /*
+ * The particular ordering (low-to-high) of (vendor,
+ * family, model) is done in case range of models, like
+ * it is usually done on AMD, need to be compared.
+ */
+ struct {
+ __u8 x86_model;
+ /* CPU family */
+ __u8 x86;
+ /* CPU vendor */
+ __u8 x86_vendor;
+ __u8 x86_reserved;
+ };
+ /* combined vendor, family, model */
+ __u32 x86_vfm;
+ };
__u8 x86_stepping;
#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
@@ -472,7 +486,6 @@ struct thread_struct {
unsigned long iopl_emul;
unsigned int iopl_warn:1;
- unsigned int sig_on_uaccess_err:1;
/*
* Protection Keys Register for Userspace. Loaded immediately on
@@ -587,7 +600,7 @@ extern char ignore_fpu_irq;
# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
-# define BASE_PREFETCH "prefetcht0 %P1"
+# define BASE_PREFETCH "prefetcht0 %1"
#endif
/*
@@ -598,7 +611,7 @@ extern char ignore_fpu_irq;
*/
static inline void prefetch(const void *x)
{
- alternative_input(BASE_PREFETCH, "prefetchnta %P1",
+ alternative_input(BASE_PREFETCH, "prefetchnta %1",
X86_FEATURE_XMM,
"m" (*(const char *)x));
}
@@ -610,7 +623,7 @@ static inline void prefetch(const void *x)
*/
static __always_inline void prefetchw(const void *x)
{
- alternative_input(BASE_PREFETCH, "prefetchw %P1",
+ alternative_input(BASE_PREFETCH, "prefetchw %1",
X86_FEATURE_3DNOWPREFETCH,
"m" (*(const char *)x));
}
@@ -636,12 +649,10 @@ static __always_inline void prefetchw(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-extern unsigned long __end_init_task[];
+extern unsigned long __top_init_kernel_stack[];
#define INIT_THREAD { \
- .sp = (unsigned long)&__end_init_task - \
- TOP_OF_KERNEL_STACK_PADDING - \
- sizeof(struct pt_regs), \
+ .sp = (unsigned long)&__top_init_kernel_stack, \
}
extern unsigned long KSTK_ESP(struct task_struct *task);
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 043758a2e627..365798cb4408 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -23,19 +23,14 @@ extern int of_ioapic;
extern u64 initial_dtb;
extern void add_dtb(u64 data);
void x86_of_pci_init(void);
-void x86_dtb_parse_smp_config(void);
+void x86_flattree_get_config(void);
#else
static inline void add_dtb(u64 data) { }
static inline void x86_of_pci_init(void) { }
-static inline void x86_dtb_parse_smp_config(void) { }
+static inline void x86_flattree_get_config(void) { }
#define of_ioapic 0
#endif
-#ifdef CONFIG_OF_EARLY_FLATTREE
-void x86_flattree_get_config(void);
-#else
-static inline void x86_flattree_get_config(void) { }
-#endif
extern char cmd_line[COMMAND_LINE_SIZE];
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index cde8357bb226..a053c1293975 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -85,6 +85,8 @@ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key);
#define virt_spin_lock virt_spin_lock
static inline bool virt_spin_lock(struct qspinlock *lock)
{
+ int val;
+
if (!static_branch_likely(&virt_spin_lock_key))
return false;
@@ -94,10 +96,13 @@ static inline bool virt_spin_lock(struct qspinlock *lock)
* horrible lock 'holder' preemption issues.
*/
- do {
- while (atomic_read(&lock->val) != 0)
- cpu_relax();
- } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
+ __retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+ cpu_relax();
+ goto __retry;
+ }
return true;
}
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index ef9697f20129..0a985784be9b 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -25,9 +25,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
*
* void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock)
* {
- * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0);
+ * u8 lockval = _Q_LOCKED_VAL;
*
- * if (likely(lockval == _Q_LOCKED_VAL))
+ * if (try_cmpxchg(&lock->locked, &lockval, 0))
* return;
* pv_queued_spin_unlock_slowpath(lock, lockval);
* }
@@ -40,10 +40,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
#define PV_UNLOCK_ASM \
FRAME_BEGIN \
"push %rdx\n\t" \
- "mov $0x1,%eax\n\t" \
+ "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \
"xor %edx,%edx\n\t" \
LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \
- "cmp $0x1,%al\n\t" \
"jne .slowpath\n\t" \
"pop %rdx\n\t" \
FRAME_END \
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 7ba1726b71c7..e9187ddd3d1f 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -99,6 +99,7 @@
#define REQUIRED_MASK18 0
#define REQUIRED_MASK19 0
#define REQUIRED_MASK20 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
+#define REQUIRED_MASK21 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index fef16e398161..42bcd42d70d1 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -9,7 +9,7 @@
#endif
#ifdef CONFIG_COMPAT
-#include <asm/ia32_unistd.h>
+#include <asm/unistd_32_ia32.h>
#define __NR_seccomp_read_32 __NR_ia32_read
#define __NR_seccomp_write_32 __NR_ia32_write
#define __NR_seccomp_exit_32 __NR_ia32_exit
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 9477b4053bce..ca20cc4e5826 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -140,7 +140,7 @@ struct secrets_os_area {
#define VMPCK_KEY_LEN 32
/* See the SNP spec version 0.9 for secrets page format */
-struct snp_secrets_page_layout {
+struct snp_secrets_page {
u32 version;
u32 imien : 1,
rsvd1 : 31;
@@ -218,17 +218,16 @@ void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages);
void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
unsigned long npages);
-void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
void __noreturn snp_abort(void);
+void snp_dmi_setup(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
u64 sev_get_status(void);
-void kdump_sev_callback(void);
void sev_show_status(void);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
@@ -244,12 +243,12 @@ static inline void __init
early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
static inline void __init
early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { }
-static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { }
static inline void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_memory_private(unsigned long vaddr, unsigned long npages) { }
static inline void snp_set_wakeup_secondary_cpu(void) { }
static inline bool snp_init(struct boot_params *bp) { return false; }
static inline void snp_abort(void) { }
+static inline void snp_dmi_setup(void) { }
static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
return -ENOTTY;
@@ -258,7 +257,6 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
static inline u64 sev_get_status(void) { return 0; }
-static inline void kdump_sev_callback(void) { }
static inline void sev_show_status(void) { }
#endif
@@ -270,6 +268,8 @@ int psmash(u64 pfn);
int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable);
int rmp_make_shared(u64 pfn, enum pg_level level);
void snp_leak_pages(u64 pfn, unsigned int npages);
+void kdump_sev_callback(void);
+void snp_fixup_e820_tables(void);
#else
static inline bool snp_probe_rmptable_info(void) { return false; }
static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; }
@@ -282,6 +282,8 @@ static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 as
}
static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; }
static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
+static inline void kdump_sev_callback(void) { }
+static inline void snp_fixup_e820_tables(void) {}
#endif
#endif
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 1be13b2dfe8b..64df897c0ee3 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -37,8 +37,6 @@ extern int phys_to_target_node(phys_addr_t start);
#define phys_to_target_node phys_to_target_node
extern int memory_add_physaddr_to_nid(u64 start);
#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
-extern int numa_fill_memblks(u64 start, u64 end);
-#define numa_fill_memblks numa_fill_memblks
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2e9fc5c400cd..aec6e2d3aa1d 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -182,8 +182,8 @@ static __always_inline void clflush(volatile void *__p)
static inline void clflushopt(volatile void *__p)
{
- alternative_io(".byte 0x3e; clflush %P0",
- ".byte 0x66; clflush %P0",
+ alternative_io(".byte 0x3e; clflush %0",
+ ".byte 0x66; clflush %0",
X86_FEATURE_CLFLUSHOPT,
"+m" (*(volatile char __force *)__p));
}
@@ -205,9 +205,9 @@ static inline void clwb(volatile void *__p)
#ifdef CONFIG_X86_USER_SHADOW_STACK
static inline int write_user_shstk_64(u64 __user *addr, u64 val)
{
- asm goto("1: wrussq %[val], (%[addr])\n"
+ asm goto("1: wrussq %[val], %[addr]\n"
_ASM_EXTABLE(1b, %l[fail])
- :: [addr] "r" (addr), [val] "r" (val)
+ :: [addr] "m" (*addr), [val] "r" (val)
:: fail);
return 0;
fail:
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 857d364b9888..9d0b324eab21 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -30,37 +30,40 @@ void *__memset(void *s, int c, size_t n);
#define __HAVE_ARCH_MEMSET16
static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosw"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosw"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#define __HAVE_ARCH_MEMSET32
static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosl"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosl"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#define __HAVE_ARCH_MEMSET64
static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
{
- long d0, d1;
- asm volatile("rep\n\t"
- "stosq"
- : "=&c" (d0), "=&D" (d1)
- : "a" (v), "1" (s), "0" (n)
- : "memory");
- return s;
+ const __auto_type s0 = s;
+ asm volatile (
+ "rep stosq"
+ : "+D" (s), "+c" (n)
+ : "a" (v)
+ : "memory"
+ );
+ return s0;
}
#endif
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index a800abb1a992..d8416b3bf832 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -12,11 +12,6 @@
/* image of the saved processor state */
struct saved_context {
- /*
- * On x86_32, all segment registers except gs are saved at kernel
- * entry in pt_regs.
- */
- u16 gs;
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
struct saved_msrs saved_msrs;
@@ -27,6 +22,11 @@ struct saved_context {
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ /*
+ * On x86_32, all segment registers except gs are saved at kernel
+ * entry in pt_regs.
+ */
+ u16 gs;
bool misc_enable_saved;
} __attribute__((packed));
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 87a7b917d30e..728c98175b9c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -358,10 +358,10 @@ struct sev_es_save_area {
struct vmcb_seg ldtr;
struct vmcb_seg idtr;
struct vmcb_seg tr;
- u64 vmpl0_ssp;
- u64 vmpl1_ssp;
- u64 vmpl2_ssp;
- u64 vmpl3_ssp;
+ u64 pl0_ssp;
+ u64 pl1_ssp;
+ u64 pl2_ssp;
+ u64 pl3_ssp;
u64 u_cet;
u8 reserved_0xc8[2];
u8 vmpl;
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index f44e2f9ab65d..2fc7bc3863ff 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,19 +16,17 @@
#include <asm/thread_info.h> /* for TS_COMPAT */
#include <asm/unistd.h>
+/* This is used purely for kernel/trace/trace_syscalls.c */
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
extern const sys_call_ptr_t sys_call_table[];
-#if defined(CONFIG_X86_32)
-#define ia32_sys_call_table sys_call_table
-#else
/*
* These may not exist, but still put the prototypes in so we
* can use IS_ENABLED().
*/
-extern const sys_call_ptr_t ia32_sys_call_table[];
-extern const sys_call_ptr_t x32_sys_call_table[];
-#endif
+extern long ia32_sys_call(const struct pt_regs *, unsigned int nr);
+extern long x32_sys_call(const struct pt_regs *, unsigned int nr);
+extern long x64_sys_call(const struct pt_regs *, unsigned int nr);
/*
* Only the low 32 bits of orig_ax are meaningful, so we return int.
@@ -127,6 +125,7 @@ static inline int syscall_get_arch(struct task_struct *task)
}
bool do_syscall_64(struct pt_regs *regs, int nr);
+void do_int80_emulation(struct pt_regs *regs);
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 345aafbc1964..6259f1937fe7 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -15,7 +15,7 @@
extern void text_poke_early(void *addr, const void *opcode, size_t len);
-extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len);
+extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);
/*
* Clear and restore the kernel write-protection flag on the local CPU.
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 237dc8cdd12b..0f9bab92a43d 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -78,7 +78,7 @@ extern int __get_user_bad(void);
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
__chk_user_ptr(ptr); \
- asm volatile("call __" #fn "_%P4" \
+ asm volatile("call __" #fn "_%c4" \
: "=a" (__ret_gu), "=r" (__val_gu), \
ASM_CALL_CONSTRAINT \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
@@ -177,7 +177,7 @@ extern void __put_user_nocheck_8(void);
__chk_user_ptr(__ptr); \
__ptr_pu = __ptr; \
__val_pu = __x; \
- asm volatile("call __" #fn "_%P[size]" \
+ asm volatile("call __" #fn "_%c[size]" \
: "=c" (__ret_pu), \
ASM_CALL_CONSTRAINT \
: "0" (__ptr_pu), \
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 8e048ca980df..0ef36190abe6 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -300,7 +300,7 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
#define vdso_cycles_ok arch_vdso_cycles_ok
/*
- * x86 specific delta calculation.
+ * x86 specific calculation of nanoseconds for the current cycle count
*
* The regular implementation assumes that clocksource reads are globally
* monotonic. The TSC can be slightly off across sockets which can cause
@@ -308,8 +308,8 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* jump.
*
* Therefore it needs to be verified that @cycles are greater than
- * @last. If not then use @last, which is the base time of the current
- * conversion period.
+ * @vd->cycles_last. If not then use @vd->cycles_last, which is the base
+ * time of the current conversion period.
*
* This variant also uses a custom mask because while the clocksource mask of
* all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
@@ -317,25 +317,37 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
* declares everything with the MSB/Sign-bit set as invalid. Therefore the
* effective mask is S64_MAX.
*/
-static __always_inline
-u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+static __always_inline u64 vdso_calc_ns(const struct vdso_data *vd, u64 cycles, u64 base)
{
- /*
- * Due to the MSB/Sign-bit being used as invalid marker (see
- * arch_vdso_cycles_valid() above), the effective mask is S64_MAX.
- */
- u64 delta = (cycles - last) & S64_MAX;
+ u64 delta = cycles - vd->cycle_last;
/*
- * Due to the above mentioned TSC wobbles, filter out negative motion.
- * Per the above masking, the effective sign bit is now bit 62.
+ * Negative motion and deltas which can cause multiplication
+ * overflow require special treatment. This check covers both as
+ * negative motion is guaranteed to be greater than @vd::max_cycles
+ * due to unsigned comparison.
+ *
+ * Due to the MSB/Sign-bit being used as invalid marker (see
+ * arch_vdso_cycles_valid() above), the effective mask is S64_MAX,
+ * but that case is also unlikely and will also take the unlikely path
+ * here.
*/
- if (unlikely(delta & (1ULL << 62)))
- return 0;
+ if (unlikely(delta > vd->max_cycles)) {
+ /*
+ * Due to the above mentioned TSC wobbles, filter out
+ * negative motion. Per the above masking, the effective
+ * sign bit is now bit 62.
+ */
+ if (delta & (1ULL << 62))
+ return base >> vd->shift;
+
+ /* Handle multiplication overflow gracefully */
+ return mul_u64_u32_add_u64_shr(delta & S64_MAX, vd->mult, base, vd->shift);
+ }
- return delta * mult;
+ return ((delta * vd->mult) + base) >> vd->shift;
}
-#define vdso_calc_delta vdso_calc_delta
+#define vdso_calc_ns vdso_calc_ns
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 9e8ac5073ecb..62ee19909903 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -84,7 +84,7 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
-#define free_vm86(t) do { } while(0)
+#define free_vm86(task) do { (void)(task); } while(0)
#endif /* CONFIG_VM86 */
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index c6a7eed03914..266daf5b5b84 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -25,6 +25,7 @@
#define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */
#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* EPT Accessed/Dirty bits */
#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* 1GB EPT pages */
+#define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* 5-level EPT paging */
/* Aggregated APIC features 24-27 */
#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* TPR shadow + virt APIC */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index b89b40f250e6..6149eabe200f 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -30,12 +30,13 @@ struct x86_init_mpparse {
* @reserve_resources: reserve the standard resources for the
* platform
* @memory_setup: platform specific memory setup
- *
+ * @dmi_setup: platform specific DMI setup
*/
struct x86_init_resources {
void (*probe_roms)(void);
void (*reserve_resources)(void);
char *(*memory_setup)(void);
+ void (*dmi_setup)(void);
};
/**
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index a9088250770f..64fbd2dbc5b7 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,6 +62,11 @@ void xen_arch_unregister_cpu(int num);
#ifdef CONFIG_PVH
void __init xen_pvh_init(struct boot_params *boot_params);
void __init mem_map_via_hcall(struct boot_params *boot_params_p);
+#ifdef CONFIG_XEN_PVH
+void __init xen_reserve_extra_memory(struct boot_params *bootp);
+#else
+static inline void xen_reserve_extra_memory(struct boot_params *bootp) { }
+#endif
#endif
/* Lazy mode for batching updates / context switch */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a448d0964fc0..ef11aa4cab42 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -7,6 +7,8 @@
*
*/
+#include <linux/const.h>
+#include <linux/bits.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/stddef.h>
@@ -40,7 +42,6 @@
#define __KVM_HAVE_IRQ_LINE
#define __KVM_HAVE_MSI
#define __KVM_HAVE_USER_NMI
-#define __KVM_HAVE_GUEST_DEBUG
#define __KVM_HAVE_MSIX
#define __KVM_HAVE_MCE
#define __KVM_HAVE_PIT_STATE2
@@ -49,7 +50,6 @@
#define __KVM_HAVE_DEBUGREGS
#define __KVM_HAVE_XSAVE
#define __KVM_HAVE_XCRS
-#define __KVM_HAVE_READONLY_MEM
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -526,9 +526,301 @@ struct kvm_pmu_event_filter {
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
-#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0)
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0)
#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+/* for KVM_CAP_MCE */
+struct kvm_x86_mce {
+ __u64 status;
+ __u64 addr;
+ __u64 misc;
+ __u64 mcg_status;
+ __u8 bank;
+ __u8 pad1[7];
+ __u64 pad2[3];
+};
+
+/* for KVM_CAP_XEN_HVM */
+#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
+#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
+#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8)
+
+struct kvm_xen_hvm_config {
+ __u32 flags;
+ __u32 msr;
+ __u64 blob_addr_32;
+ __u64 blob_addr_64;
+ __u8 blob_size_32;
+ __u8 blob_size_64;
+ __u8 pad2[30];
+};
+
+struct kvm_xen_hvm_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u8 long_mode;
+ __u8 vector;
+ __u8 runstate_update_flag;
+ union {
+ __u64 gfn;
+#define KVM_XEN_INVALID_GFN ((__u64)-1)
+ __u64 hva;
+ } shared_info;
+ struct {
+ __u32 send_port;
+ __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+ __u32 flags;
+#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0)
+#define KVM_XEN_EVTCHN_UPDATE (1 << 1)
+#define KVM_XEN_EVTCHN_RESET (1 << 2)
+ /*
+ * Events sent by the guest are either looped back to
+ * the guest itself (potentially on a different port#)
+ * or signalled via an eventfd.
+ */
+ union {
+ struct {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+ } port;
+ struct {
+ __u32 port; /* Zero for eventfd */
+ __s32 fd;
+ } eventfd;
+ __u32 padding[4];
+ } deliver;
+ } evtchn;
+ __u32 xen_version;
+ __u64 pad[8];
+ } u;
+};
+
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1
+#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3
+#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */
+#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6
+
+struct kvm_xen_vcpu_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u64 gpa;
+#define KVM_XEN_INVALID_GPA ((__u64)-1)
+ __u64 hva;
+ __u64 pad[8];
+ struct {
+ __u64 state;
+ __u64 state_entry_time;
+ __u64 time_running;
+ __u64 time_runnable;
+ __u64 time_blocked;
+ __u64 time_offline;
+ } runstate;
+ __u32 vcpu_id;
+ struct {
+ __u32 port;
+ __u32 priority;
+ __u64 expires_ns;
+ } timer;
+ __u8 vector;
+ } u;
+};
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6
+#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7
+#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9
+
+/* Secure Encrypted Virtualization command */
+enum sev_cmd_id {
+ /* Guest initialization commands */
+ KVM_SEV_INIT = 0,
+ KVM_SEV_ES_INIT,
+ /* Guest launch commands */
+ KVM_SEV_LAUNCH_START,
+ KVM_SEV_LAUNCH_UPDATE_DATA,
+ KVM_SEV_LAUNCH_UPDATE_VMSA,
+ KVM_SEV_LAUNCH_SECRET,
+ KVM_SEV_LAUNCH_MEASURE,
+ KVM_SEV_LAUNCH_FINISH,
+ /* Guest migration commands (outgoing) */
+ KVM_SEV_SEND_START,
+ KVM_SEV_SEND_UPDATE_DATA,
+ KVM_SEV_SEND_UPDATE_VMSA,
+ KVM_SEV_SEND_FINISH,
+ /* Guest migration commands (incoming) */
+ KVM_SEV_RECEIVE_START,
+ KVM_SEV_RECEIVE_UPDATE_DATA,
+ KVM_SEV_RECEIVE_UPDATE_VMSA,
+ KVM_SEV_RECEIVE_FINISH,
+ /* Guest status and debug commands */
+ KVM_SEV_GUEST_STATUS,
+ KVM_SEV_DBG_DECRYPT,
+ KVM_SEV_DBG_ENCRYPT,
+ /* Guest certificates commands */
+ KVM_SEV_CERT_EXPORT,
+ /* Attestation report */
+ KVM_SEV_GET_ATTESTATION_REPORT,
+ /* Guest Migration Extension */
+ KVM_SEV_SEND_CANCEL,
+
+ KVM_SEV_NR_MAX,
+};
+
+struct kvm_sev_cmd {
+ __u32 id;
+ __u32 pad0;
+ __u64 data;
+ __u32 error;
+ __u32 sev_fd;
+};
+
+struct kvm_sev_launch_start {
+ __u32 handle;
+ __u32 policy;
+ __u64 dh_uaddr;
+ __u32 dh_len;
+ __u32 pad0;
+ __u64 session_uaddr;
+ __u32 session_len;
+ __u32 pad1;
+};
+
+struct kvm_sev_launch_update_data {
+ __u64 uaddr;
+ __u32 len;
+ __u32 pad0;
+};
+
+
+struct kvm_sev_launch_secret {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u32 pad0;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u32 pad1;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+ __u32 pad2;
+};
+
+struct kvm_sev_launch_measure {
+ __u64 uaddr;
+ __u32 len;
+ __u32 pad0;
+};
+
+struct kvm_sev_guest_status {
+ __u32 handle;
+ __u32 policy;
+ __u32 state;
+};
+
+struct kvm_sev_dbg {
+ __u64 src_uaddr;
+ __u64 dst_uaddr;
+ __u32 len;
+ __u32 pad0;
+};
+
+struct kvm_sev_attestation_report {
+ __u8 mnonce[16];
+ __u64 uaddr;
+ __u32 len;
+ __u32 pad0;
+};
+
+struct kvm_sev_send_start {
+ __u32 policy;
+ __u32 pad0;
+ __u64 pdh_cert_uaddr;
+ __u32 pdh_cert_len;
+ __u32 pad1;
+ __u64 plat_certs_uaddr;
+ __u32 plat_certs_len;
+ __u32 pad2;
+ __u64 amd_certs_uaddr;
+ __u32 amd_certs_len;
+ __u32 pad3;
+ __u64 session_uaddr;
+ __u32 session_len;
+ __u32 pad4;
+};
+
+struct kvm_sev_send_update_data {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u32 pad0;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u32 pad1;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+ __u32 pad2;
+};
+
+struct kvm_sev_receive_start {
+ __u32 handle;
+ __u32 policy;
+ __u64 pdh_uaddr;
+ __u32 pdh_len;
+ __u32 pad0;
+ __u64 session_uaddr;
+ __u32 session_len;
+ __u32 pad1;
+};
+
+struct kvm_sev_receive_update_data {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u32 pad0;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u32 pad1;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+ __u32 pad2;
+};
+
+#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+
+struct kvm_hyperv_eventfd {
+ __u32 conn_id;
+ __s32 fd;
+ __u32 flags;
+ __u32 padding[3];
+};
+
+#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff
+#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0)
+
/*
* Masked event layout.
* Bits Description
@@ -549,10 +841,10 @@ struct kvm_pmu_event_filter {
((__u64)(!!(exclude)) << 55))
#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \
- (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32))
-#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56))
-#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8))
-#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55))
+ (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8))
+#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55))
#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56)
/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
@@ -560,7 +852,7 @@ struct kvm_pmu_event_filter {
#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
/* x86-specific KVM_EXIT_HYPERCALL flags. */
-#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0)
+#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0)
#define KVM_X86_DEFAULT_VM 0
#define KVM_X86_SW_PROTECTED_VM 1
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 6e64b27b2c1e..a1efa7907a0b 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -92,7 +92,7 @@ struct kvm_clock_pairing {
#define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3)
/* MSR_KVM_ASYNC_PF_INT */
-#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0)
+#define KVM_ASYNC_PF_VEC_MASK __GENMASK(7, 0)
/* MSR_KVM_MIGRATION_CONTROL */
#define KVM_MIGRATION_READY (1 << 0)
@@ -142,7 +142,6 @@ struct kvm_vcpu_pv_apf_data {
__u32 token;
__u8 pad[56];
- __u32 enabled;
};
#define KVM_PV_EOI_BIT 0
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d0c744cb2a0e..5d128167e2e2 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -62,7 +62,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
-obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
+obj-y += pci-dma.o quirks.o kdebugfs.o
obj-y += alternative.o i8253.o hw_breakpoint.o
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += resource.o
@@ -100,11 +100,11 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_RETHOOK) += rethook.o
-obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_X86_32) += doublefault_32.o
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 8d8752b44f11..ff8f25faca3d 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -20,7 +20,7 @@ bool cpc_supported_by_cpu(void)
(boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f)))
return true;
else if (boot_cpu_data.x86 == 0x17 &&
- boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f)
+ boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f)
return true;
return boot_cpu_has(X86_FEATURE_CPPC);
}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 401808b47af3..f3ffd0a3a012 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -131,8 +131,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
- cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
- MWAIT_CSTATE_MASK) + 1;
+ cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &
+ MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK;
edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ff6e32ec8259..7555c15b7183 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -125,6 +125,20 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
};
/*
+ * Nomenclature for variable names to simplify and clarify this code and ease
+ * any potential staring at it:
+ *
+ * @instr: source address of the original instructions in the kernel text as
+ * generated by the compiler.
+ *
+ * @buf: temporary buffer on which the patching operates. This buffer is
+ * eventually text-poked into the kernel image.
+ *
+ * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
+ * in the .altinstr_replacement section.
+ */
+
+/*
* Fill the buffer with a single effective instruction of size @len.
*
* In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
@@ -133,28 +147,28 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
* each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
* *jump* over instead of executing long and daft NOPs.
*/
-static void add_nop(u8 *instr, unsigned int len)
+static void add_nop(u8 *buf, unsigned int len)
{
- u8 *target = instr + len;
+ u8 *target = buf + len;
if (!len)
return;
if (len <= ASM_NOP_MAX) {
- memcpy(instr, x86_nops[len], len);
+ memcpy(buf, x86_nops[len], len);
return;
}
if (len < 128) {
- __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
- instr += JMP8_INSN_SIZE;
+ __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
+ buf += JMP8_INSN_SIZE;
} else {
- __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
- instr += JMP32_INSN_SIZE;
+ __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
+ buf += JMP32_INSN_SIZE;
}
- for (;instr < target; instr++)
- *instr = INT3_INSN_OPCODE;
+ for (;buf < target; buf++)
+ *buf = INT3_INSN_OPCODE;
}
extern s32 __retpoline_sites[], __retpoline_sites_end[];
@@ -187,12 +201,12 @@ static bool insn_is_nop(struct insn *insn)
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
*/
-static int skip_nops(u8 *instr, int offset, int len)
+static int skip_nops(u8 *buf, int offset, int len)
{
struct insn insn;
for (; offset < len; offset += insn.length) {
- if (insn_decode_kernel(&insn, &instr[offset]))
+ if (insn_decode_kernel(&insn, &buf[offset]))
break;
if (!insn_is_nop(&insn))
@@ -203,66 +217,32 @@ static int skip_nops(u8 *instr, int offset, int len)
}
/*
- * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
- * to the end of the NOP sequence into a single NOP.
- */
-static bool
-__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
-{
- int i = *next - insn->length;
-
- switch (insn->opcode.bytes[0]) {
- case JMP8_INSN_OPCODE:
- case JMP32_INSN_OPCODE:
- *prev = i;
- *target = *next + insn->immediate.value;
- return false;
- }
-
- if (insn_is_nop(insn)) {
- int nop = i;
-
- *next = skip_nops(instr, *next, len);
- if (*target && *next == *target)
- nop = *prev;
-
- add_nop(instr + nop, *next - nop);
- DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
- return true;
- }
-
- *target = 0;
- return false;
-}
-
-/*
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
-static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
+static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
{
- int prev, target = 0;
-
for (int next, i = 0; i < len; i = next) {
struct insn insn;
- if (insn_decode_kernel(&insn, &instr[i]))
+ if (insn_decode_kernel(&insn, &buf[i]))
return;
next = i + insn.length;
- __optimize_nops(instr, len, &insn, &next, &prev, &target);
- }
-}
+ if (insn_is_nop(&insn)) {
+ int nop = i;
-static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
-{
- unsigned long flags;
+ /* Has the NOP already been optimized? */
+ if (i + insn.length == len)
+ return;
- local_irq_save(flags);
- optimize_nops(instr, len);
- sync_core();
- local_irq_restore(flags);
+ next = skip_nops(buf, next, len);
+
+ add_nop(buf + nop, next - nop);
+ DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
+ }
+ }
}
/*
@@ -335,11 +315,9 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
return (target < src || target > src + src_len);
}
-void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
- int prev, target = 0;
-
- for (int next, i = 0; i < len; i = next) {
+ for (int next, i = 0; i < instrlen; i = next) {
struct insn insn;
if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
@@ -347,9 +325,6 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
next = i + insn.length;
- if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
- continue;
-
switch (insn.opcode.bytes[0]) {
case 0x0f:
if (insn.opcode.bytes[1] < 0x80 ||
@@ -361,10 +336,10 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
case JMP8_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case CALL_INSN_OPCODE:
- if (need_reloc(next + insn.immediate.value, src, src_len)) {
+ if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
apply_reloc(insn.immediate.nbytes,
buf + i + insn_offset_immediate(&insn),
- src - dest);
+ repl - instr);
}
/*
@@ -372,7 +347,7 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
*/
if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
s32 imm = insn.immediate.value;
- imm += src - dest;
+ imm += repl - instr;
imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
if ((imm >> 31) == (imm >> 7)) {
buf[i+0] = JMP8_INSN_OPCODE;
@@ -385,15 +360,21 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
if (insn_rip_relative(&insn)) {
- if (need_reloc(next + insn.displacement.value, src, src_len)) {
+ if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
apply_reloc(insn.displacement.nbytes,
buf + i + insn_offset_displacement(&insn),
- src - dest);
+ repl - instr);
}
}
}
}
+void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ __apply_relocation(buf, instr, instrlen, repl, repl_len);
+ optimize_nops(instr, buf, repl_len);
+}
+
/* Low-level backend functions usable from alternative code replacements. */
DEFINE_ASM_FUNC(nop_func, "", .entry.text);
EXPORT_SYMBOL_GPL(nop_func);
@@ -464,9 +445,9 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a)
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
- struct alt_instr *a;
- u8 *instr, *replacement;
u8 insn_buff[MAX_PATCH_LEN];
+ u8 *instr, *replacement;
+ struct alt_instr *a;
DPRINTK(ALT, "alt table %px, -> %px", start, end);
@@ -504,7 +485,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- optimize_nops_inplace(instr, a->instrlen);
+ memcpy(insn_buff, instr, a->instrlen);
+ optimize_nops(instr, insn_buff, a->instrlen);
+ text_poke_early(instr, insn_buff, a->instrlen);
continue;
}
@@ -526,7 +509,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
- apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
+ apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@@ -761,7 +744,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
- optimize_nops(bytes, len);
+ optimize_nops(addr, bytes, len);
DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
text_poke_early(addr, bytes, len);
@@ -1804,7 +1787,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
* restoring the previous mm.
*/
if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
- leave_mm(smp_processor_id());
+ leave_mm();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 5bf5f9fc5753..3cf156f70859 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -95,6 +95,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
{}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a42d8a6f7149..66fd4b2a37a3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -497,32 +497,32 @@ static struct clock_event_device lapic_clockevent = {
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static const struct x86_cpu_id deadline_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
+ X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
+ X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
+ X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
+ X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17),
+ X86_MATCH_VFM(INTEL_HASWELL, 0x22),
+ X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
+ X86_MATCH_VFM(INTEL_HASWELL_G, 0x17),
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25),
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17),
+ X86_MATCH_VFM(INTEL_BROADWELL, 0x25),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17),
- X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2),
- X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2),
- X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52),
- X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE, 0x52),
{},
};
@@ -631,7 +631,7 @@ void lapic_update_tsc_freq(void)
static __initdata int lapic_cal_loops = -1;
static __initdata long lapic_cal_t1, lapic_cal_t2;
static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata u32 lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
@@ -641,7 +641,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
{
unsigned long long tsc = 0;
long tapic = apic_read(APIC_TMCCT);
- unsigned long pm = acpi_pm_read_early();
+ u32 pm = acpi_pm_read_early();
if (boot_cpu_has(X86_FEATURE_TSC))
tsc = rdtsc();
@@ -666,7 +666,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
static int __init
-calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -677,7 +677,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+ apic_printk(APIC_VERBOSE, "... PM-Timer delta = %u\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
@@ -1687,11 +1687,11 @@ static int x2apic_state;
static bool x2apic_hw_locked(void)
{
- u64 ia32_cap;
+ u64 x86_arch_cap_msr;
u64 msr;
- ia32_cap = x86_read_arch_cap_msr();
- if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) {
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+ if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) {
rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
return (msr & LEGACY_XAPIC_DISABLED);
}
@@ -1771,7 +1771,7 @@ void x2apic_setup(void)
__x2apic_enable();
}
-static __init void apic_set_fixmap(void);
+static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
@@ -1793,7 +1793,12 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
- apic_set_fixmap();
+ /*
+ * Don't reread the APIC ID as it was already done from
+ * check_x2apic() and the APIC driver still is a x2APIC variant,
+ * which fails to do the read after x2APIC was disabled.
+ */
+ apic_set_fixmap(false);
}
static __init void x2apic_enable(void)
@@ -2057,13 +2062,14 @@ void __init init_apic_mappings(void)
}
}
-static __init void apic_set_fixmap(void)
+static __init void apic_set_fixmap(bool read_apic)
{
set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
apic_mmio_base = APIC_BASE;
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
apic_mmio_base, mp_lapic_addr);
- apic_read_boot_cpu_id(false);
+ if (read_apic)
+ apic_read_boot_cpu_id(false);
}
void __init register_lapic_address(unsigned long address)
@@ -2073,7 +2079,7 @@ void __init register_lapic_address(unsigned long address)
mp_lapic_addr = address;
if (!x2apic_mode)
- apic_set_fixmap();
+ apic_set_fixmap(true);
}
/*
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 185738c72766..9eec52925fa3 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
lockdep_assert_held(&vector_lock);
hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
- unsigned int irr, vector = apicd->prev_vector;
+ unsigned int vector = apicd->prev_vector;
/*
* Paranoia: Check if the vector that needs to be cleaned
@@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
* fixup_irqs() was just called to scan IRR for set bits and
* forward them to new destination CPUs via IPIs.
*/
- irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0;
- if (irr & (1U << (vector % 32))) {
+ if (check_irr && is_vector_pending(vector)) {
pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
rearm = true;
continue;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 567dbd2fe4b6..7db83212effb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -178,13 +178,16 @@ static int x2apic_prepare_cpu(unsigned int cpu)
u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
u32 cluster = apic_cluster(phys_apicid);
u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+ int node = cpu_to_node(cpu);
x86_cpu_to_logical_apicid[cpu] = logical_apicid;
- if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
+ if (alloc_clustermask(cpu, cluster, node) < 0)
return -ENOMEM;
- if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
+
+ if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node))
return -ENOMEM;
+
return 0;
}
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index 30335182b6b0..465647456753 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -185,8 +185,7 @@ static void *patch_dest(void *dest, bool direct)
u8 *pad = dest - tsize;
memcpy(insn_buff, skl_call_thunk_template, tsize);
- apply_relocation(insn_buff, tsize, pad,
- skl_call_thunk_template, tsize);
+ apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
/* Already patched? */
if (!bcmp(pad, insn_buff, tsize))
@@ -308,13 +307,12 @@ static bool is_callthunk(void *addr)
pad = (void *)(dest - tmpl_size);
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, tmpl_size, pad,
- skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
return !bcmp(pad, insn_buff, tmpl_size);
}
-int x86_call_depth_emit_accounting(u8 **pprog, void *func)
+int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
{
unsigned int tmpl_size = SKL_TMPL_SIZE;
u8 insn_buff[MAX_PATCH_LEN];
@@ -327,8 +325,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func)
return 0;
memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
- apply_relocation(insn_buff, tmpl_size, *pprog,
- skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
memcpy(*pprog, insn_buff, tmpl_size);
*pprog += tmpl_size;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3282a747b645..44df3f11e731 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -13,6 +13,7 @@
#include <asm/apic.h>
#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
#include <asm/numa.h>
@@ -345,6 +346,28 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
+static void bsp_determine_snp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+ cc_vendor = CC_VENDOR_AMD;
+
+ if (cpu_has(c, X86_FEATURE_SEV_SNP)) {
+ /*
+ * RMP table entry format is not architectural and is defined by the
+ * per-processor PPR. Restrict SNP support on the known CPU models
+ * for which the RMP table entry format is currently defined for.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ c->x86 >= 0x19 && snp_probe_rmptable_info()) {
+ cc_platform_set(CC_ATTR_HOST_SEV_SNP);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
+ }
+ }
+#endif
+}
+
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
@@ -437,8 +460,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
case 0x1a:
switch (c->x86_model) {
- case 0x00 ... 0x0f:
- case 0x20 ... 0x2f:
+ case 0x00 ... 0x2f:
case 0x40 ... 0x4f:
case 0x70 ... 0x7f:
setup_force_cpu_cap(X86_FEATURE_ZEN5);
@@ -452,21 +474,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
break;
}
- if (cpu_has(c, X86_FEATURE_SEV_SNP)) {
- /*
- * RMP table entry format is not architectural and it can vary by processor
- * and is defined by the per-processor PPR. Restrict SNP support on the
- * known CPU model and family for which the RMP table entry format is
- * currently defined for.
- */
- if (!boot_cpu_has(X86_FEATURE_ZEN3) &&
- !boot_cpu_has(X86_FEATURE_ZEN4) &&
- !boot_cpu_has(X86_FEATURE_ZEN5))
- setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
- else if (!snp_probe_rmptable_info())
- setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
- }
-
+ bsp_determine_snp(c);
return;
warn:
@@ -527,7 +535,6 @@ clear_sev:
static void early_init_amd(struct cpuinfo_x86 *c)
{
- u64 value;
u32 dummy;
if (c->x86 >= 0xf)
@@ -595,20 +602,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
early_detect_mem_encrypt(c);
- /* Re-enable TopologyExtensions if switched off by BIOS */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
- !cpu_has(c, X86_FEATURE_TOPOEXT)) {
-
- if (msr_set_bit(0xc0011005, 54) > 0) {
- rdmsrl(0xc0011005, value);
- if (value & BIT_64(54)) {
- set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
- }
- }
- }
-
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
@@ -802,6 +795,11 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static const struct x86_cpu_desc erratum_1386_microcode[] = {
+ AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e),
+ AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052),
+};
+
static void fix_erratum_1386(struct cpuinfo_x86 *c)
{
/*
@@ -811,7 +809,13 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
*
* Affected parts all have no supervisor XSAVE states, meaning that
* the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
*/
+ if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode))
+ return;
+
clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
@@ -864,11 +868,11 @@ static bool cpu_has_zenbleed_microcode(void)
u32 good_rev = 0;
switch (boot_cpu_data.x86_model) {
- case 0x30 ... 0x3f: good_rev = 0x0830107a; break;
- case 0x60 ... 0x67: good_rev = 0x0860010b; break;
- case 0x68 ... 0x6f: good_rev = 0x08608105; break;
- case 0x70 ... 0x7f: good_rev = 0x08701032; break;
- case 0xa0 ... 0xaf: good_rev = 0x08a00008; break;
+ case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010c; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608107; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701033; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
default:
return false;
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index fdbb5f07448f..f9a8c7b7943f 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -124,25 +124,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
return true;
}
-#define X86_MATCH(model) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
- X86_MATCH(XEON_PHI_KNL),
- X86_MATCH(XEON_PHI_KNM),
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
- X86_MATCH(SKYLAKE_X),
+ X86_MATCH(INTEL_SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
- X86_MATCH(ATOM_GOLDMONT),
- X86_MATCH(ATOM_GOLDMONT_D),
- X86_MATCH(ATOM_GOLDMONT_PLUS),
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
{}
};
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index e7ba936d798b..b6f927f6c567 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -26,7 +26,7 @@
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
@@ -61,6 +61,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
EXPORT_SYMBOL_GPL(x86_pred_cmd);
+static u64 __ro_after_init x86_arch_cap_msr;
+
static DEFINE_MUTEX(spec_ctrl_mutex);
void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
@@ -144,6 +146,8 @@ void __init cpu_select_mitigations(void)
x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
}
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
@@ -301,8 +305,6 @@ static const char * const taa_strings[] = {
static void __init taa_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_TAA)) {
taa_mitigation = TAA_MITIGATION_OFF;
return;
@@ -341,9 +343,8 @@ static void __init taa_select_mitigation(void)
* On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
* update is required.
*/
- ia32_cap = x86_read_arch_cap_msr();
- if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
- !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
/*
@@ -401,8 +402,6 @@ static const char * const mmio_strings[] = {
static void __init mmio_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) ||
cpu_mitigations_off()) {
@@ -413,8 +412,6 @@ static void __init mmio_select_mitigation(void)
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
- ia32_cap = x86_read_arch_cap_msr();
-
/*
* Enable CPU buffer clear mitigation for host and VMM, if also affected
* by MDS or TAA. Otherwise, enable mitigation for VMM only.
@@ -437,7 +434,7 @@ static void __init mmio_select_mitigation(void)
* be propagated to uncore buffers, clearing the Fill buffers on idle
* is required irrespective of SMT state.
*/
- if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
+ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
static_branch_enable(&mds_idle_clear);
/*
@@ -447,10 +444,10 @@ static void __init mmio_select_mitigation(void)
* FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
* affected systems.
*/
- if ((ia32_cap & ARCH_CAP_FB_CLEAR) ||
+ if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
(boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
- !(ia32_cap & ARCH_CAP_MDS_NO)))
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))
mmio_mitigation = MMIO_MITIGATION_VERW;
else
mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
@@ -508,7 +505,7 @@ static void __init rfds_select_mitigation(void)
if (rfds_mitigation == RFDS_MITIGATION_OFF)
return;
- if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR)
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
else
rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
@@ -659,8 +656,6 @@ void update_srbds_msr(void)
static void __init srbds_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_SRBDS))
return;
@@ -669,8 +664,7 @@ static void __init srbds_select_mitigation(void)
* are only exposed to SRBDS when TSX is enabled or when CPU is affected
* by Processor MMIO Stale Data vulnerability.
*/
- ia32_cap = x86_read_arch_cap_msr();
- if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
@@ -813,7 +807,7 @@ static void __init gds_select_mitigation(void)
/* Will verify below that mitigation _can_ be disabled */
/* No microcode */
- if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) {
+ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
if (gds_mitigation == GDS_MITIGATION_FORCE) {
/*
* This only needs to be done on the boot CPU so do it
@@ -1544,20 +1538,25 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
return SPECTRE_V2_RETPOLINE;
}
+static bool __ro_after_init rrsba_disabled;
+
/* Disable in-kernel use of non-RSB RET predictors */
static void __init spec_ctrl_disable_kernel_rrsba(void)
{
- u64 ia32_cap;
+ if (rrsba_disabled)
+ return;
- if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) {
+ rrsba_disabled = true;
return;
+ }
- ia32_cap = x86_read_arch_cap_msr();
+ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ return;
- if (ia32_cap & ARCH_CAP_RRSBA) {
- x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
- update_spec_ctrl(x86_spec_ctrl_base);
- }
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ rrsba_disabled = true;
}
static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
@@ -1607,6 +1606,74 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
dump_stack();
}
+/*
+ * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by
+ * branch history in userspace. Not needed if BHI_NO is set.
+ */
+static bool __init spec_ctrl_bhi_dis(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW);
+
+ return true;
+}
+
+enum bhi_mitigations {
+ BHI_MITIGATION_OFF,
+ BHI_MITIGATION_ON,
+};
+
+static enum bhi_mitigations bhi_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF;
+
+static int __init spectre_bhi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else
+ pr_err("Ignoring unknown spectre_bhi option (%s)", str);
+
+ return 0;
+}
+early_param("spectre_bhi", spectre_bhi_parse_cmdline);
+
+static void __init bhi_select_mitigation(void)
+{
+ if (bhi_mitigation == BHI_MITIGATION_OFF)
+ return;
+
+ /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ spec_ctrl_disable_kernel_rrsba();
+ if (rrsba_disabled)
+ return;
+ }
+
+ if (spec_ctrl_bhi_dis())
+ return;
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Mitigate KVM by default */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ pr_info("Spectre BHI mitigation: SW BHB clearing on vm exit\n");
+
+ /* Mitigate syscalls when the mitigation is forced =on */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall\n");
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -1718,6 +1785,9 @@ static void __init spectre_v2_select_mitigation(void)
mode == SPECTRE_V2_RETPOLINE)
spec_ctrl_disable_kernel_rrsba();
+ if (boot_cpu_has(X86_BUG_BHI))
+ bhi_select_mitigation();
+
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
@@ -1832,8 +1902,6 @@ static void update_indir_branch_cond(void)
/* Update the static key controlling the MDS CPU buffer clear in idle */
static void update_mds_branch_idle(void)
{
- u64 ia32_cap = x86_read_arch_cap_msr();
-
/*
* Enable the idle clearing if SMT is active on CPUs which are
* affected only by MSBDS and not any other MDS variant.
@@ -1848,7 +1916,7 @@ static void update_mds_branch_idle(void)
if (sched_smt_active()) {
static_branch_enable(&mds_idle_clear);
} else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
- (ia32_cap & ARCH_CAP_FBSDP_NO)) {
+ (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
static_branch_disable(&mds_idle_clear);
}
}
@@ -2323,20 +2391,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
if (c->x86_cache_bits < 44)
c->x86_cache_bits = 44;
break;
@@ -2695,15 +2763,15 @@ static char *stibp_state(void)
switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
- return ", STIBP: disabled";
+ return "; STIBP: disabled";
case SPECTRE_V2_USER_STRICT:
- return ", STIBP: forced";
+ return "; STIBP: forced";
case SPECTRE_V2_USER_STRICT_PREFERRED:
- return ", STIBP: always-on";
+ return "; STIBP: always-on";
case SPECTRE_V2_USER_PRCTL:
case SPECTRE_V2_USER_SECCOMP:
if (static_key_enabled(&switch_to_cond_stibp))
- return ", STIBP: conditional";
+ return "; STIBP: conditional";
}
return "";
}
@@ -2712,10 +2780,10 @@ static char *ibpb_state(void)
{
if (boot_cpu_has(X86_FEATURE_IBPB)) {
if (static_key_enabled(&switch_mm_always_ibpb))
- return ", IBPB: always-on";
+ return "; IBPB: always-on";
if (static_key_enabled(&switch_mm_cond_ibpb))
- return ", IBPB: conditional";
- return ", IBPB: disabled";
+ return "; IBPB: conditional";
+ return "; IBPB: disabled";
}
return "";
}
@@ -2725,14 +2793,32 @@ static char *pbrsb_eibrs_state(void)
if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
- return ", PBRSB-eIBRS: SW sequence";
+ return "; PBRSB-eIBRS: SW sequence";
else
- return ", PBRSB-eIBRS: Vulnerable";
+ return "; PBRSB-eIBRS: Vulnerable";
} else {
- return ", PBRSB-eIBRS: Not affected";
+ return "; PBRSB-eIBRS: Not affected";
}
}
+static const char *spectre_bhi_state(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_BHI))
+ return "; BHI: Not affected";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW))
+ return "; BHI: BHI_DIS_S";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP))
+ return "; BHI: SW loop, KVM: SW loop";
+ else if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) &&
+ rrsba_disabled)
+ return "; BHI: Retpoline";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT))
+ return "; BHI: Vulnerable, KVM: SW loop";
+
+ return "; BHI: Vulnerable";
+}
+
static ssize_t spectre_v2_show_state(char *buf)
{
if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
@@ -2745,13 +2831,15 @@ static ssize_t spectre_v2_show_state(char *buf)
spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
- return sysfs_emit(buf, "%s%s%s%s%s%s%s\n",
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n",
spectre_v2_strings[spectre_v2_enabled],
ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "",
stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "",
pbrsb_eibrs_state(),
+ spectre_bhi_state(),
+ /* this should always be at the end */
spectre_v2_module_string());
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ba8cf5e9ce56..2b170da84f97 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -68,6 +68,7 @@
#include <asm/traps.h>
#include <asm/sev.h>
#include <asm/tdx.h>
+#include <asm/posted_intr.h>
#include "cpu.h"
@@ -114,17 +115,17 @@ static const struct x86_cpu_id ppin_cpuids[] = {
X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
/* Legacy models without CPUID enumeration */
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
{}
};
@@ -1053,18 +1054,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- bool vp_bits_from_cpuid = true;
if (!cpu_has(c, X86_FEATURE_CPUID) ||
- (c->extended_cpuid_level < 0x80000008))
- vp_bits_from_cpuid = false;
-
- if (vp_bits_from_cpuid) {
- cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
-
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- } else {
+ (c->extended_cpuid_level < 0x80000008)) {
if (IS_ENABLED(CONFIG_X86_64)) {
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1078,7 +1070,13 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
}
+ } else {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
}
+
c->x86_cache_bits = c->x86_phys_bits;
c->x86_cache_alignment = c->x86_clflush_size;
}
@@ -1120,12 +1118,13 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define NO_SPECTRE_V2 BIT(8)
#define NO_MMIO BIT(9)
#define NO_EIBRS_PBRSB BIT(10)
+#define NO_BHI BIT(11)
#define VULNWL(vendor, family, model, whitelist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
-#define VULNWL_INTEL(model, whitelist) \
- VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
#define VULNWL_AMD(family, whitelist) \
VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
@@ -1142,32 +1141,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(TIGERLAKE, NO_MMIO),
- VULNWL_INTEL(TIGERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(CORE_YONAH, NO_SSB),
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1177,33 +1176,31 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
- VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
/* Zhaoxin Family 7 */
- VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),
- VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
{}
};
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
-#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, steppings, \
- X86_FEATURE_ANY, issues)
+#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \
+ X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues)
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@@ -1228,43 +1225,43 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
- VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
- VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
@@ -1283,25 +1280,25 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi
u64 x86_read_arch_cap_msr(void)
{
- u64 ia32_cap = 0;
+ u64 x86_arch_cap_msr = 0;
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
- return ia32_cap;
+ return x86_arch_cap_msr;
}
-static bool arch_cap_mmio_immune(u64 ia32_cap)
+static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr)
{
- return (ia32_cap & ARCH_CAP_FBSDP_NO &&
- ia32_cap & ARCH_CAP_PSDP_NO &&
- ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
+ return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_PSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO);
}
-static bool __init vulnerable_to_rfds(u64 ia32_cap)
+static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
{
/* The "immunity" bit trumps everything else: */
- if (ia32_cap & ARCH_CAP_RFDS_NO)
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO)
return false;
/*
@@ -1309,7 +1306,7 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap)
* indicate that mitigation is needed because guest is running on a
* vulnerable hardware or may migrate to such hardware:
*/
- if (ia32_cap & ARCH_CAP_RFDS_CLEAR)
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
return true;
/* Only consult the blacklist when there is no enumeration: */
@@ -1318,11 +1315,11 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap)
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
- u64 ia32_cap = x86_read_arch_cap_msr();
+ u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
- !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO))
setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
@@ -1334,7 +1331,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
- !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
@@ -1345,17 +1342,17 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* Don't use AutoIBRS when SNP is enabled because it degrades host
* userspace indirect branch performance.
*/
- if ((ia32_cap & ARCH_CAP_IBRS_ALL) ||
+ if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) ||
(cpu_has(c, X86_FEATURE_AUTOIBRS) &&
!cpu_feature_enabled(X86_FEATURE_SEV_SNP))) {
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
- !(ia32_cap & ARCH_CAP_PBRSB_NO))
+ !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO))
setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
}
if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
- !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
@@ -1374,9 +1371,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* TSX_CTRL check alone is not sufficient for cases when the microcode
* update is not present or running as guest that don't get TSX_CTRL.
*/
- if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) &&
(cpu_has(c, X86_FEATURE_RTM) ||
- (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)))
setup_force_cpu_bug(X86_BUG_TAA);
/*
@@ -1402,7 +1399,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist,
* nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits.
*/
- if (!arch_cap_mmio_immune(ia32_cap)) {
+ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) {
if (cpu_matches(cpu_vuln_blacklist, MMIO))
setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO))
@@ -1410,7 +1407,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
}
if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
- if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA))
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA))
setup_force_cpu_bug(X86_BUG_RETBLEED);
}
@@ -1428,18 +1425,25 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* disabling AVX2. The only way to do this in HW is to clear XCR0[2],
* which means that AVX will be disabled.
*/
- if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) &&
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) &&
boot_cpu_has(X86_FEATURE_AVX))
setup_force_cpu_bug(X86_BUG_GDS);
- if (vulnerable_to_rfds(ia32_cap))
+ if (vulnerable_to_rfds(x86_arch_cap_msr))
setup_force_cpu_bug(X86_BUG_RFDS);
+ /* When virtualized, eIBRS could be hidden, assume vulnerable */
+ if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) &&
+ !cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR)))
+ setup_force_cpu_bug(X86_BUG_BHI);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
- if (ia32_cap & ARCH_CAP_RDCL_NO)
+ if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO)
return;
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
@@ -2219,6 +2223,8 @@ void cpu_init(void)
barrier();
x2apic_setup();
+
+ intel_posted_msi_init();
}
mmgrab(&init_mm);
@@ -2307,6 +2313,8 @@ void arch_smt_update(void)
void __init arch_cpu_finalize_init(void)
{
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+
identify_boot_cpu();
select_idle_routine();
@@ -2345,6 +2353,13 @@ void __init arch_cpu_finalize_init(void)
fpu__init_system();
fpu__init_cpu();
+ /*
+ * Ensure that access to the per CPU representation has the initial
+ * boot CPU configuration.
+ */
+ *c = boot_cpu_data;
+ c->initialized = true;
+
alternative_instructions();
if (IS_ENABLED(CONFIG_X86_64)) {
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index b7174209d855..b7d9f530ae16 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -44,7 +44,10 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_F16C, X86_FEATURE_XMM2, },
{ X86_FEATURE_AES, X86_FEATURE_XMM2 },
{ X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_GFNI, X86_FEATURE_XMM2 },
{ X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX },
{ X86_FEATURE_AVX2, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
@@ -56,9 +59,6 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
- { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
@@ -114,6 +114,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
if (WARN_ON(feature >= MAX_FEATURE_BITS))
return;
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
clear_feature(c, feature);
/* Collect all features to disable, handling dependencies */
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 03851240c3e3..1640ae76548f 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -72,6 +72,8 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
if (ept & VMX_EPT_1GB_PAGE_BIT)
c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+ if (ept & VMX_EPT_PAGE_WALK_5_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL);
/* Synthetic APIC features that are aggregates of multiple features. */
if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index be30d7fa2e66..3c3e7e5695ba 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -228,6 +228,7 @@ static void detect_tme_early(struct cpuinfo_x86 *c)
if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
pr_info_once("x86/tme: not enabled by BIOS\n");
mktme_status = MKTME_DISABLED;
+ clear_cpu_cap(c, X86_FEATURE_TME);
return;
}
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index f18d35fe27a9..30b1d63b97f3 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -204,12 +204,12 @@ static int intel_epb_offline(unsigned int cpu)
}
static const struct x86_cpu_id intel_epb_normal[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
{}
};
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index ad6776081e60..8651643bddae 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -17,8 +17,7 @@
*
* A typical table entry would be to match a specific CPU
*
- * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
- * X86_FEATURE_ANY, NULL);
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
* %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
@@ -26,7 +25,7 @@
* asm/cpu_device_id.h contains a set of useful macros which are shortcuts
* for various common selections. The above can be shortened to:
*
- * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index b5cc557cfc37..ad0623b659ed 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -47,7 +47,7 @@
#include <linux/kexec.h>
#include <asm/fred.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
@@ -1593,6 +1593,24 @@ noinstr void do_machine_check(struct pt_regs *regs)
else
queue_task_work(&m, msg, kill_me_maybe);
+ } else if (m.mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(&m)) {
+ struct page *p = pfn_to_online_page(m.addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1930,14 +1948,14 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
cfg->bootlog = 0;
- if (c->x86 == 6 && c->x86_model == 45)
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
mce_flags.snb_ifu_quirk = 1;
/*
* Skylake, Cascacde Lake and Cooper Lake require a quirk on
* rep movs.
*/
- if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
mce_flags.skx_repmov_quirk = 1;
}
@@ -2500,12 +2518,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return -EINVAL;
b = &per_cpu(mce_banks_array, s->id)[bank];
-
if (!b->init)
return -ENODEV;
b->ctl = new;
+
+ mutex_lock(&mce_sysfs_mutex);
mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
return size;
}
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
index fbe8b61c3413..4284749ec803 100644
--- a/arch/x86/kernel/cpu/mce/genpool.c
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -16,14 +16,14 @@
* used to save error information organized in a lock-less list.
*
* This memory pool is only to be used to save MCE records in MCE context.
- * MCE events are rare, so a fixed size memory pool should be enough. Use
- * 2 pages to save MCE events for now (~80 MCE records at most).
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on number of CPUs.
*/
-#define MCE_POOLSZ (2 * PAGE_SIZE)
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
static struct gen_pool *mce_evt_pool;
static LLIST_HEAD(mce_event_llist);
-static char gen_pool_buf[MCE_POOLSZ];
/*
* Compare the record "t" with each of the records on list "l" to see if
@@ -118,22 +118,32 @@ int mce_gen_pool_add(struct mce *mce)
static int mce_gen_pool_create(void)
{
- struct gen_pool *tmpp;
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
int ret = -ENOMEM;
-
- tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
- if (!tmpp)
- goto out;
-
- ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return ret;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return ret;
+ }
+ ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1);
if (ret) {
- gen_pool_destroy(tmpp);
- goto out;
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return ret;
}
- mce_evt_pool = tmpp;
+ mce_evt_pool = gpool;
-out:
return ret;
}
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 399b62e223d2..f6103e6bf69a 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -13,7 +13,7 @@
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -455,10 +455,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
{
u64 error_control;
- switch (c->x86_model) {
- case INTEL_FAM6_SANDYBRIDGE_X:
- case INTEL_FAM6_IVYBRIDGE_X:
- case INTEL_FAM6_HASWELL_X:
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
return;
error_control |= 2;
@@ -484,12 +484,11 @@ bool intel_filter_mce(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
/* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
- if ((c->x86 == 6) &&
- ((c->x86_model == INTEL_FAM6_HASWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_L) ||
- (c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G) ||
- (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d..dac4d64dfb2a 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -12,7 +12,7 @@
#include <linux/uaccess.h>
#include <asm/mce.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
@@ -39,20 +39,20 @@ static struct severity {
u64 mask;
u64 result;
unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
+ unsigned short mcgmask;
+ unsigned short mcgres;
unsigned char ser;
unsigned char context;
unsigned char excp;
unsigned char covered;
- unsigned char cpu_model;
+ unsigned int cpu_vfm;
unsigned char cpu_minstepping;
unsigned char bank_lo, bank_hi;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
-#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define KERNEL_RECOV .context = IN_KERNEL_RECOV
@@ -128,7 +128,7 @@ static struct severity {
MCESEV(
AO, "Uncorrected Patrol Scrub Error",
SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
- MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
),
/* ignore OVER for UCNA */
@@ -174,6 +174,18 @@ static struct severity {
USER
),
MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
PANIC, "Data load in unrecoverable area of kernel",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
@@ -290,7 +302,6 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
switch (fixup_type) {
case EX_TYPE_UACCESS:
- case EX_TYPE_COPY:
if (!copy_user)
return IN_KERNEL;
m->kflags |= MCE_IN_KERNEL_COPYIN;
@@ -386,7 +397,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
continue;
if (s->excp && excp != s->excp)
continue;
- if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model)
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
continue;
if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
continue;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 13b45b9c806d..c0d56c02b8da 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -84,8 +84,6 @@ struct microcode_amd {
unsigned int mpb[];
};
-#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
-
static struct equiv_cpu_table {
unsigned int num_entries;
struct equiv_cpu_entry *entry;
@@ -465,7 +463,7 @@ static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, siz
return !__apply_microcode_amd(mc);
}
-static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp, u8 family)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct firmware fw;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 232026a239a6..b3658d11e7b6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -60,11 +60,6 @@ module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
*/
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-struct cpu_info_ctx {
- struct cpu_signature *cpu_sig;
- int err;
-};
-
/*
* Those patch levels cannot be updated to newer ones and thus should be final.
*/
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5f0414452b67..815fa67356a2 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -21,7 +21,7 @@
#include <linux/uio.h>
#include <linux/mm.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -577,8 +577,7 @@ static bool is_blacklisted(unsigned int cpu)
* This behavior is documented in item BDF90, #334165 (Intel Xeon
* Processor E7-8800/4800 v4 Product Family).
*/
- if (c->x86 == 6 &&
- c->x86_model == INTEL_FAM6_BROADWELL_X &&
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
c->x86_stepping == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 45e0e70e238c..e0fd57a8ba84 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -45,70 +45,70 @@ bool hyperv_paravisor_present __ro_after_init;
EXPORT_SYMBOL_GPL(hyperv_paravisor_present);
#if IS_ENABLED(CONFIG_HYPERV)
-static inline unsigned int hv_get_nested_reg(unsigned int reg)
+static inline unsigned int hv_get_nested_msr(unsigned int reg)
{
- if (hv_is_sint_reg(reg))
- return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0;
+ if (hv_is_sint_msr(reg))
+ return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0;
switch (reg) {
- case HV_REGISTER_SIMP:
- return HV_REGISTER_NESTED_SIMP;
- case HV_REGISTER_SIEFP:
- return HV_REGISTER_NESTED_SIEFP;
- case HV_REGISTER_SVERSION:
- return HV_REGISTER_NESTED_SVERSION;
- case HV_REGISTER_SCONTROL:
- return HV_REGISTER_NESTED_SCONTROL;
- case HV_REGISTER_EOM:
- return HV_REGISTER_NESTED_EOM;
+ case HV_X64_MSR_SIMP:
+ return HV_X64_MSR_NESTED_SIMP;
+ case HV_X64_MSR_SIEFP:
+ return HV_X64_MSR_NESTED_SIEFP;
+ case HV_X64_MSR_SVERSION:
+ return HV_X64_MSR_NESTED_SVERSION;
+ case HV_X64_MSR_SCONTROL:
+ return HV_X64_MSR_NESTED_SCONTROL;
+ case HV_X64_MSR_EOM:
+ return HV_X64_MSR_NESTED_EOM;
default:
return reg;
}
}
-u64 hv_get_non_nested_register(unsigned int reg)
+u64 hv_get_non_nested_msr(unsigned int reg)
{
u64 value;
- if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present)
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present)
hv_ivm_msr_read(reg, &value);
else
rdmsrl(reg, value);
return value;
}
-EXPORT_SYMBOL_GPL(hv_get_non_nested_register);
+EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
-void hv_set_non_nested_register(unsigned int reg, u64 value)
+void hv_set_non_nested_msr(unsigned int reg, u64 value)
{
- if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) {
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
hv_ivm_msr_write(reg, value);
/* Write proxy bit via wrmsl instruction */
- if (hv_is_sint_reg(reg))
+ if (hv_is_sint_msr(reg))
wrmsrl(reg, value | 1 << 20);
} else {
wrmsrl(reg, value);
}
}
-EXPORT_SYMBOL_GPL(hv_set_non_nested_register);
+EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
-u64 hv_get_register(unsigned int reg)
+u64 hv_get_msr(unsigned int reg)
{
if (hv_nested)
- reg = hv_get_nested_reg(reg);
+ reg = hv_get_nested_msr(reg);
- return hv_get_non_nested_register(reg);
+ return hv_get_non_nested_msr(reg);
}
-EXPORT_SYMBOL_GPL(hv_get_register);
+EXPORT_SYMBOL_GPL(hv_get_msr);
-void hv_set_register(unsigned int reg, u64 value)
+void hv_set_msr(unsigned int reg, u64 value)
{
if (hv_nested)
- reg = hv_get_nested_reg(reg);
+ reg = hv_get_nested_msr(reg);
- hv_set_non_nested_register(reg, value);
+ hv_set_non_nested_msr(reg, value);
}
-EXPORT_SYMBOL_GPL(hv_set_register);
+EXPORT_SYMBOL_GPL(hv_set_msr);
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
@@ -209,7 +209,9 @@ static void hv_machine_shutdown(void)
if (kexec_in_progress)
hyperv_cleanup();
}
+#endif /* CONFIG_KEXEC_CORE */
+#ifdef CONFIG_CRASH_DUMP
static void hv_machine_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
@@ -221,7 +223,7 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs)
/* Disable the hypercall page when there is only 1 active CPU. */
hyperv_cleanup();
}
-#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_CRASH_DUMP */
#endif /* CONFIG_HYPERV */
static uint32_t __init ms_hyperv_platform(void)
@@ -350,13 +352,24 @@ static void __init reduced_hw_init(void)
x86_init.irqs.pre_vector_init = x86_init_noop;
}
+int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
+{
+ unsigned int hv_max_functions;
+
+ hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+ if (hv_max_functions < HYPERV_CPUID_VERSION) {
+ pr_err("%s: Could not detect Hyper-V version\n", __func__);
+ return -ENODEV;
+ }
+
+ cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx);
+
+ return 0;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_max_functions_eax;
- int hv_host_info_eax;
- int hv_host_info_ebx;
- int hv_host_info_ecx;
- int hv_host_info_edx;
#ifdef CONFIG_PARAVIRT
pv_info.name = "Hyper-V";
@@ -407,21 +420,6 @@ static void __init ms_hyperv_init_platform(void)
pr_info("Hyper-V: running on a nested hypervisor\n");
}
- /*
- * Extract host information.
- */
- if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) {
- hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
- hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
- hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
- hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
-
- pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
- hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF,
- hv_host_info_eax, hv_host_info_edx & 0xFFFFFF,
- hv_host_info_ecx, hv_host_info_edx >> 24);
- }
-
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
@@ -454,7 +452,7 @@ static void __init ms_hyperv_init_platform(void)
/* To be supported: more work is required. */
ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
- /* HV_REGISTER_CRASH_CTL is unsupported. */
+ /* HV_MSR_CRASH_CTL is unsupported. */
ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
/* Don't trust Hyper-V's TLB-flushing hypercalls. */
@@ -495,10 +493,14 @@ static void __init ms_hyperv_init_platform(void)
no_timer_check = 1;
#endif
-#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
+#if IS_ENABLED(CONFIG_HYPERV)
+#if defined(CONFIG_KEXEC_CORE)
machine_ops.shutdown = hv_machine_shutdown;
+#endif
+#if defined(CONFIG_CRASH_DUMP)
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
+#endif
if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
/*
* Writing to synthetic MSR 0x40000118 updates/changes the
@@ -642,6 +644,7 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
+ .init.guest_late_init = ms_hyperv_late_init,
#ifdef CONFIG_AMD_MEM_ENCRYPT
.runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
.runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 422a4ddc2ab7..7b29ebda024f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -108,7 +108,7 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
- if (cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return;
rdmsr(MSR_AMD64_SYSCFG, lo, hi);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 83e40341583e..a113d9aba553 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -22,7 +22,7 @@
#include <linux/cacheinfo.h>
#include <linux/cpuhotplug.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include "internal.h"
@@ -56,14 +56,9 @@ int max_name_width, max_data_width;
*/
bool rdt_alloc_capable;
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains)
@@ -309,12 +304,11 @@ static void rdt_get_cdp_l2_config(void)
rdt_get_cdp_config(RDT_RESOURCE_L2);
}
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void mba_wrmsr_amd(struct msr_param *m)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
@@ -334,25 +328,22 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
return r->default_ctrl;
}
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r)
+static void mba_wrmsr_intel(struct msr_param *m)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
/* Write the delay values for mba. */
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r));
+ wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
}
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void cat_wrmsr(struct msr_param *m)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(m->dom);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
@@ -362,6 +353,8 @@ struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
{
struct rdt_domain *d;
+ lockdep_assert_cpus_held();
+
list_for_each_entry(d, &r->domains, list) {
/* Find the domain that contains this CPU */
if (cpumask_test_cpu(cpu, &d->cpu_mask))
@@ -378,19 +371,11 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
void rdt_ctrl_update(void *arg)
{
+ struct rdt_hw_resource *hw_res;
struct msr_param *m = arg;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
- struct rdt_resource *r = m->res;
- int cpu = smp_processor_id();
- struct rdt_domain *d;
- d = get_domain_from_cpu(cpu, r);
- if (d) {
- hw_res->msr_update(d, m, r);
- return;
- }
- pr_warn_once("cpu %d not found in any domain for resource %s\n",
- cpu, r->name);
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
}
/*
@@ -463,9 +448,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
hw_dom->ctrl_val = dc;
setup_default_ctrlval(r, dc);
+ m.res = r;
+ m.dom = d;
m.low = 0;
m.high = hw_res->num_closid;
- hw_res->msr_update(d, &m, r);
+ hw_res->msr_update(&m);
return 0;
}
@@ -821,18 +808,18 @@ static __init bool get_rdt_mon_resources(void)
static __init void __check_quirks_intel(void)
{
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_HASWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
cache_alloc_hsw_probe();
break;
- case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_SKYLAKE_X:
if (boot_cpu_data.x86_stepping <= 4)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
else
set_rdt_options("!l3cat");
fallthrough;
- case INTEL_FAM6_BROADWELL_X:
+ case INTEL_BROADWELL_X:
intel_rdt_mbm_apply_quirk();
break;
}
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 7997b47743a2..b7291f60399c 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -272,22 +272,6 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
}
}
-static bool apply_config(struct rdt_hw_domain *hw_dom,
- struct resctrl_staged_config *cfg, u32 idx,
- cpumask_var_t cpu_mask)
-{
- struct rdt_domain *dom = &hw_dom->d_resctrl;
-
- if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) {
- cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
- hw_dom->ctrl_val[idx] = cfg->new_ctrl;
-
- return true;
- }
-
- return false;
-}
-
int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
@@ -302,9 +286,10 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
hw_dom->ctrl_val[idx] = cfg_val;
msr_param.res = r;
+ msr_param.dom = d;
msr_param.low = idx;
msr_param.high = idx + 1;
- hw_res->msr_update(d, &msr_param, r);
+ hw_res->msr_update(&msr_param);
return 0;
}
@@ -315,48 +300,39 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
struct rdt_hw_domain *hw_dom;
struct msr_param msr_param;
enum resctrl_conf_type t;
- cpumask_var_t cpu_mask;
struct rdt_domain *d;
u32 idx;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
- msr_param.res = NULL;
list_for_each_entry(d, &r->domains, list) {
hw_dom = resctrl_to_arch_dom(d);
+ msr_param.res = NULL;
for (t = 0; t < CDP_NUM_TYPES; t++) {
cfg = &hw_dom->d_resctrl.staged_config[t];
if (!cfg->have_new_ctrl)
continue;
idx = get_config_index(closid, t);
- if (!apply_config(hw_dom, cfg, idx, cpu_mask))
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
if (!msr_param.res) {
msr_param.low = idx;
msr_param.high = msr_param.low + 1;
msr_param.res = r;
+ msr_param.dom = d;
} else {
msr_param.low = min(msr_param.low, idx);
msr_param.high = max(msr_param.high, idx + 1);
}
}
+ if (msr_param.res)
+ smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- if (cpumask_empty(cpu_mask))
- goto done;
-
- /* Update resource control msr on all the CPUs. */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
-done:
- free_cpumask_var(cpu_mask);
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index c99f26ebe7a6..f1d926832ec8 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -78,7 +78,8 @@ cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
else
cpu = cpumask_any_but(mask, exclude_cpu);
- if (!IS_ENABLED(CONFIG_NO_HZ_FULL))
+ /* Only continue if tick_nohz_full_mask has been initialized. */
+ if (!tick_nohz_full_enabled())
return cpu;
/* If the CPU picked isn't marked nohz_full nothing more needs doing. */
@@ -378,11 +379,13 @@ static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
/**
* struct msr_param - set a range of MSRs from a domain
* @res: The resource to use
+ * @dom: The domain to update
* @low: Beginning index from base MSR
* @high: End index
*/
struct msr_param {
struct rdt_resource *res;
+ struct rdt_domain *dom;
u32 low;
u32 high;
};
@@ -442,8 +445,7 @@ struct rdt_hw_resource {
struct rdt_resource r_resctrl;
u32 num_closid;
unsigned int msr_base;
- void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+ void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
unsigned int mbm_cfg_mask;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c34a35ec0f03..2345e6836593 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -24,6 +24,7 @@
#include <asm/resctrl.h>
#include "internal.h"
+#include "trace.h"
/**
* struct rmid_entry - dirty tracking for all RMID.
@@ -354,6 +355,16 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
rmid_dirty = true;
} else {
rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+
+ /*
+ * x86's CLOSID and RMID are independent numbers, so the entry's
+ * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
+ * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
+ * used to select the configuration. It is thus necessary to track both
+ * CLOSID and RMID because there may be dependencies between them
+ * on some architectures.
+ */
+ trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->id, val);
}
if (force_free || !rmid_dirty) {
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 884b88e25141..aacf236dfe3b 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -23,7 +23,7 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include <asm/perf_event.h>
@@ -31,7 +31,7 @@
#include "internal.h"
#define CREATE_TRACE_POINTS
-#include "pseudo_lock_event.h"
+#include "trace.h"
/*
* The bits needed to disable hardware prefetching varies based on the
@@ -88,8 +88,8 @@ static u64 get_prefetch_disable_bits(void)
boot_cpu_data.x86 != 6)
return 0;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -100,8 +100,8 @@ static u64 get_prefetch_disable_bits(void)
* 63:4 Reserved
*/
return 0xF;
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -1084,9 +1084,9 @@ static int measure_l2_residency(void *_plr)
* L2_HIT 02H
* L2_MISS 10H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
.umask = 0x10);
perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
@@ -1123,8 +1123,8 @@ static int measure_l3_residency(void *_plr)
* MISS 41H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/* On BDW the hit event counts references, not hits */
perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
.umask = 0x4f);
@@ -1142,7 +1142,7 @@ static int measure_l3_residency(void *_plr)
*/
counts.miss_after -= counts.miss_before;
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
/*
* On BDW references and misses are counted, need to adjust.
* Sometimes the "hits" counter is a bit more than the
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 011e17efb1a6..02f213f1c51c 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2813,16 +2813,12 @@ static int reset_all_ctrls(struct rdt_resource *r)
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_hw_domain *hw_dom;
struct msr_param msr_param;
- cpumask_var_t cpu_mask;
struct rdt_domain *d;
int i;
/* Walking r->domains, ensure it can't race with cpuhp */
lockdep_assert_cpus_held();
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
-
msr_param.res = r;
msr_param.low = 0;
msr_param.high = hw_res->num_closid;
@@ -2834,17 +2830,13 @@ static int reset_all_ctrls(struct rdt_resource *r)
*/
list_for_each_entry(d, &r->domains, list) {
hw_dom = resctrl_to_arch_dom(d);
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
for (i = 0; i < hw_res->num_closid; i++)
hw_dom->ctrl_val[i] = r->default_ctrl;
+ msr_param.dom = d;
+ smp_call_function_any(&d->cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- /* Update CBM on all the CPUs in cpu_mask */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
- free_cpumask_var(cpu_mask);
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/trace.h
index 428ebbd4270b..2a506316b303 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h
+++ b/arch/x86/kernel/cpu/resctrl/trace.h
@@ -2,8 +2,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl
-#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_PSEUDO_LOCK_H
+#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RESCTRL_H
#include <linux/tracepoint.h>
@@ -35,9 +35,25 @@ TRACE_EVENT(pseudo_lock_l3,
TP_printk("hits=%llu miss=%llu",
__entry->l3_hits, __entry->l3_miss));
-#endif /* _TRACE_PSEUDO_LOCK_H */
+TRACE_EVENT(mon_llc_occupancy_limbo,
+ TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+ TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+ TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+ __field(u32, mon_hw_id)
+ __field(int, domain_id)
+ __field(u64, llc_occupancy_bytes)),
+ TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+ __entry->mon_hw_id = mon_hw_id;
+ __entry->domain_id = domain_id;
+ __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+ TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
+ __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
+ __entry->llc_occupancy_bytes)
+ );
+
+#endif /* _TRACE_RESCTRL_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE pseudo_lock_event
+#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 0dad49a09b7a..af5aa2c754c2 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
{ X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
{ X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
{ X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
{ X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
@@ -49,6 +50,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 3259b1d4fefe..d17c9b71eb4a 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -123,7 +123,6 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
#endif
- set_cpu_possible(cpu, true);
set_cpu_present(cpu, true);
}
@@ -210,7 +209,11 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
topo_info.nr_disabled_cpus++;
}
- /* Register present and possible CPUs in the domain maps */
+ /*
+ * Register present and possible CPUs in the domain
+ * maps. cpu_possible_map will be updated in
+ * topology_init_possible_cpus() after enumeration is done.
+ */
for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
}
@@ -415,6 +418,17 @@ void __init topology_init_possible_cpus(void)
unsigned int total = assigned + disabled;
u32 apicid, firstid;
+ /*
+ * If there was no APIC registered, then fake one so that the
+ * topology bitmap is populated. That ensures that the code below
+ * is valid and the various query interfaces can be used
+ * unconditionally. This does not affect the actual APIC code in
+ * any way because either the local APIC address has not been
+ * registered or the local APIC was disabled on the command line.
+ */
+ if (topo_info.boot_cpu_apic_id == BAD_APICID)
+ topology_register_boot_apic(0);
+
if (!restrict_to_up()) {
if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
disabled += assigned - nr_cpu_ids;
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
index 1a8b3ad493af..d419deed6a48 100644
--- a/arch/x86/kernel/cpu/topology_amd.c
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -29,11 +29,21 @@ static bool parse_8000_0008(struct topo_scan *tscan)
if (!sft)
sft = get_count_order(ecx.cpu_nthreads + 1);
- topology_set_dom(tscan, TOPO_SMT_DOMAIN, sft, ecx.cpu_nthreads + 1);
+ /*
+ * cpu_nthreads describes the number of threads in the package
+ * sft is the number of APIC ID bits per package
+ *
+ * As the number of actual threads per core is not described in
+ * this leaf, just set the CORE domain shift and let the later
+ * parsers set SMT shift. Assume one thread per core by default
+ * which is correct if there are no other CPUID leafs to parse.
+ */
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1);
return true;
}
-static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_id)
+static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
{
/*
* Starting with Fam 17h the DIE domain could probably be used to
@@ -48,7 +58,7 @@ static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_
tscan->amd_node_id = node_id;
}
-static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
+static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
{
struct {
// eax
@@ -73,12 +83,14 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
tscan->c->topo.initial_apicid = leaf.ext_apic_id;
/*
- * If leaf 0xb is available, then SMT shift is set already. If not
- * take it from ecx.threads_per_core and use topo_update_dom() -
- * topology_set_dom() would propagate and overwrite the already
- * propagated CORE level.
+ * If leaf 0xb is available, then the domain shifts are set
+ * already and nothing to do here.
*/
- if (!has_0xb) {
+ if (!has_topoext) {
+ /*
+ * Leaf 0x80000008 set the CORE domain shift already.
+ * Update the SMT domain, but do not propagate it.
+ */
unsigned int nthreads = leaf.core_nthreads + 1;
topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads);
@@ -107,64 +119,86 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb)
return true;
}
-static bool parse_fam10h_node_id(struct topo_scan *tscan)
+static void parse_fam10h_node_id(struct topo_scan *tscan)
{
- struct {
- union {
+ union {
+ struct {
u64 node_id : 3,
nodes_per_pkg : 3,
unused : 58;
- u64 msr;
};
+ u64 msr;
} nid;
if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
- return false;
+ return;
rdmsrl(MSR_FAM10H_NODE_ID, nid.msr);
store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
tscan->c->topo.llc_id = nid.node_id;
- return true;
}
static void legacy_set_llc(struct topo_scan *tscan)
{
unsigned int apicid = tscan->c->topo.initial_apicid;
- /* parse_8000_0008() set everything up except llc_id */
- tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+ /* If none of the parsers set LLC ID then use the die ID for it. */
+ if (tscan->c->topo.llc_id == BAD_APICID)
+ tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+}
+
+static void topoext_fixup(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u64 msrval;
+
+ /* Try to re-enable TopologyExtensions if switched off by BIOS */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD ||
+ c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f)
+ return;
+
+ if (msr_set_bit(0xc0011005, 54) <= 0)
+ return;
+
+ rdmsrl(0xc0011005, msrval);
+ if (msrval & BIT_64(54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
}
static void parse_topology_amd(struct topo_scan *tscan)
{
- bool has_0xb = false;
+ bool has_topoext = false;
/*
* If the extended topology leaf 0x8000_001e is available
- * try to get SMT and CORE shift from leaf 0xb first, then
- * try to get the CORE shift from leaf 0x8000_0008.
+ * try to get SMT, CORE, TILE, and DIE shifts from extended
+ * CPUID leaf 0x8000_0026 on supported processors first. If
+ * extended CPUID leaf 0x8000_0026 is not supported, try to
+ * get SMT and CORE shift from leaf 0xb first, then try to
+ * get the CORE shift from leaf 0x8000_0008.
*/
if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
- has_0xb = cpu_parse_topology_ext(tscan);
+ has_topoext = cpu_parse_topology_ext(tscan);
- if (!has_0xb && !parse_8000_0008(tscan))
+ if (!has_topoext && !parse_8000_0008(tscan))
return;
/* Prefer leaf 0x8000001e if available */
- if (parse_8000_001e(tscan, has_0xb))
+ if (parse_8000_001e(tscan, has_topoext))
return;
/* Try the NODEID MSR */
- if (parse_fam10h_node_id(tscan))
- return;
-
- legacy_set_llc(tscan);
+ parse_fam10h_node_id(tscan);
}
void cpu_parse_topology_amd(struct topo_scan *tscan)
{
tscan->amd_nodes_per_pkg = 1;
+ topoext_fixup(tscan);
parse_topology_amd(tscan);
+ legacy_set_llc(tscan);
if (tscan->amd_nodes_per_pkg > 1)
set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM);
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
index a50ae8d63d1c..9a6069e7133c 100644
--- a/arch/x86/kernel/cpu/topology_common.c
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -140,7 +140,7 @@ static void parse_topology(struct topo_scan *tscan, bool early)
}
}
-static void topo_set_ids(struct topo_scan *tscan)
+static void topo_set_ids(struct topo_scan *tscan, bool early)
{
struct cpuinfo_x86 *c = tscan->c;
u32 apicid = c->topo.apicid;
@@ -148,8 +148,10 @@ static void topo_set_ids(struct topo_scan *tscan)
c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN);
c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
- c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
- c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ if (!early) {
+ c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
+ c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ }
/* Package relative core ID */
c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >>
@@ -187,7 +189,7 @@ void cpu_parse_topology(struct cpuinfo_x86 *c)
tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
}
- topo_set_ids(&tscan);
+ topo_set_ids(&tscan, false);
}
void __init cpu_init_topology(struct cpuinfo_x86 *c)
@@ -208,7 +210,7 @@ void __init cpu_init_topology(struct cpuinfo_x86 *c)
x86_topo_system.dom_size[dom] = 1U << sft;
}
- topo_set_ids(&tscan);
+ topo_set_ids(&tscan, true);
/*
* AMD systems have Nodes per package which cannot be mapped to
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
index e477228cd5b2..467b0326bf1a 100644
--- a/arch/x86/kernel/cpu/topology_ext.c
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -13,7 +13,10 @@ enum topo_types {
CORE_TYPE = 2,
MAX_TYPE_0B = 3,
MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
DIE_TYPE = 5,
DIEGRP_TYPE = 6,
MAX_TYPE_1F = 7,
@@ -32,6 +35,13 @@ static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
[DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
};
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
unsigned int *last_dom)
{
@@ -56,6 +66,7 @@ static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
switch (leaf) {
case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
default: return false;
}
@@ -125,6 +136,10 @@ bool cpu_parse_topology_ext(struct topo_scan *tscan)
if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
return true;
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
/* Intel/AMD: Fall back to leaf 0xB if available */
return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 4aeafe63521b..8e3c53b4d070 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -24,6 +24,7 @@
#include <asm/pci_x86.h>
#include <asm/setup.h>
#include <asm/i8259.h>
+#include <asm/numa.h>
#include <asm/prom.h>
__initdata u64 initial_dtb;
@@ -137,6 +138,7 @@ static void __init dtb_cpu_setup(void)
continue;
}
topology_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+ set_apicid_to_node(apic_id, of_node_to_nid(dn));
}
}
@@ -277,36 +279,40 @@ static void __init dtb_apic_setup(void)
dtb_ioapic_setup();
}
-#ifdef CONFIG_OF_EARLY_FLATTREE
+static void __init x86_dtb_parse_smp_config(void)
+{
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
+
void __init x86_flattree_get_config(void)
{
+#ifdef CONFIG_OF_EARLY_FLATTREE
u32 size, map_len;
void *dt;
- if (!initial_dtb)
- return;
+ if (initial_dtb) {
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
- map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+ dt = early_memremap(initial_dtb, map_len);
+ size = fdt_totalsize(dt);
+ if (map_len < size) {
+ early_memunmap(dt, map_len);
+ dt = early_memremap(initial_dtb, size);
+ map_len = size;
+ }
- dt = early_memremap(initial_dtb, map_len);
- size = fdt_totalsize(dt);
- if (map_len < size) {
- early_memunmap(dt, map_len);
- dt = early_memremap(initial_dtb, size);
- map_len = size;
+ early_init_dt_verify(dt);
}
- early_init_dt_verify(dt);
unflatten_and_copy_device_tree();
- early_memunmap(dt, map_len);
-}
-#endif
-void __init x86_dtb_parse_smp_config(void)
-{
- if (!of_have_populated_dt())
- return;
-
- dtb_setup_hpet();
- dtb_apic_setup();
+ if (initial_dtb)
+ early_memunmap(dt, map_len);
+#endif
+ if (of_have_populated_dt())
+ x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 44a91ef5a23b..a7d562697e50 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -405,8 +405,8 @@ static void __die_header(const char *str, struct pt_regs *regs, long err)
pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
printk(KERN_DEFAULT
- "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
- pr,
+ "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff,
+ ++die_counter, pr,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index b66f540de054..68b09f718f10 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -532,9 +532,10 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum
return __e820__range_update(e820_table, start, size, old_type, new_type);
}
-static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size,
+ enum e820_type old_type, enum e820_type new_type)
{
- return __e820__range_update(e820_table_kexec, start, size, old_type, new_type);
+ return __e820__range_update(t, start, size, old_type, new_type);
}
/* Remove a range of memory from the E820 table: */
@@ -806,7 +807,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
addr = memblock_phys_alloc(size, align);
if (addr) {
- e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
e820__update_table_kexec();
}
@@ -1016,17 +1017,6 @@ void __init e820__reserve_setup_data(void)
e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- /*
- * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by
- * kexec and do not need to be reserved.
- */
- if (data->type != SETUP_EFI &&
- data->type != SETUP_IMA &&
- data->type != SETUP_RNG_SEED)
- e820__range_update_kexec(pa_data,
- sizeof(*data) + data->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
-
if (data->type == SETUP_INDIRECT) {
len += data->len;
early_memunmap(data, sizeof(*data));
@@ -1038,12 +1028,9 @@ void __init e820__reserve_setup_data(void)
indirect = (struct setup_indirect *)data->data;
- if (indirect->type != SETUP_INDIRECT) {
+ if (indirect->type != SETUP_INDIRECT)
e820__range_update(indirect->addr, indirect->len,
E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- e820__range_update_kexec(indirect->addr, indirect->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- }
}
pa_data = pa_next;
@@ -1051,7 +1038,6 @@ void __init e820__reserve_setup_data(void)
}
e820__update_table(e820_table);
- e820__update_table(e820_table_kexec);
pr_info("extended physical RAM map:\n");
e820__print_table("reserve setup_data");
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
index e963344b0449..53935b4d62e3 100644
--- a/arch/x86/kernel/eisa.c
+++ b/arch/x86/kernel/eisa.c
@@ -2,6 +2,7 @@
/*
* EISA specific code
*/
+#include <linux/cc_platform.h>
#include <linux/ioport.h>
#include <linux/eisa.h>
#include <linux/io.h>
@@ -12,7 +13,7 @@ static __init int eisa_bus_probe(void)
{
void __iomem *p;
- if (xen_pv_domain() && !xen_initial_domain())
+ if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return 0;
p = ioremap(0x0FFFD9, 4);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 520deb411a70..1209c7aebb21 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -145,8 +145,8 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
asm volatile(
"fnclex\n\t"
"emms\n\t"
- "fildl %P[addr]" /* set F?P to defined value */
- : : [addr] "m" (fpstate));
+ "fildl %[addr]" /* set F?P to defined value */
+ : : [addr] "m" (*fpstate));
}
if (use_xsave()) {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 117e74c44e75..6276329f5e66 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -178,10 +178,11 @@ void fpu__init_cpu_xstate(void)
* Must happen after CR4 setup and before xsetbv() to allow KVM
* lazy passthrough. Write independent of the dynamic state static
* key as that does not work on the boot CPU. This also ensures
- * that any stale state is wiped out from XFD.
+ * that any stale state is wiped out from XFD. Reset the per CPU
+ * xfd cache too.
*/
if (cpu_feature_enabled(X86_FEATURE_XFD))
- wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
+ xfd_set_state(init_fpstate.xfd);
/*
* XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
@@ -1433,8 +1434,8 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
return rstor;
/*
- * XSAVE(S): clone(), fpu_swap_kvm_fpu()
- * XRSTORS(S): fpu_swap_kvm_fpu()
+ * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
+ * XRSTORS(S): fpu_swap_kvm_fpstate()
*/
/*
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 3518fb26d06b..19ca623ffa2a 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -148,20 +148,26 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs
#endif
#ifdef CONFIG_X86_64
+static inline void xfd_set_state(u64 xfd)
+{
+ wrmsrl(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+}
+
static inline void xfd_update_state(struct fpstate *fpstate)
{
if (fpu_state_size_dynamic()) {
u64 xfd = fpstate->xfd;
- if (__this_cpu_read(xfd_state) != xfd) {
- wrmsrl(MSR_IA32_XFD, xfd);
- __this_cpu_write(xfd_state, xfd);
- }
+ if (__this_cpu_read(xfd_state) != xfd)
+ xfd_set_state(xfd);
}
}
extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
#else
+static inline void xfd_set_state(u64 xfd) { }
+
static inline void xfd_update_state(struct fpstate *fpstate) { }
static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 212e8e06aeba..a817ed0724d1 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -81,6 +81,13 @@ static inline bool check_la57_support(void)
if (!(native_read_cr4() & X86_CR4_LA57))
return false;
+ RIP_REL_REF(__pgtable_l5_enabled) = 1;
+ RIP_REL_REF(pgdir_shift) = 48;
+ RIP_REL_REF(ptrs_per_p4d) = 512;
+ RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5;
+ RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5;
+ RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5;
+
return true;
}
@@ -175,7 +182,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
- pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE_NOENC;
+ pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
}
RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
@@ -431,15 +438,6 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
- if (check_la57_support()) {
- __pgtable_l5_enabled = 1;
- pgdir_shift = 48;
- ptrs_per_p4d = 512;
- page_offset_base = __PAGE_OFFSET_BASE_L5;
- vmalloc_base = __VMALLOC_BASE_L5;
- vmemmap_base = __VMEMMAP_BASE_L5;
- }
-
cr4_init_shadow();
/* Kill off the identity-map trampoline */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b50f3641c4d6..2e42056d2306 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -44,9 +44,6 @@
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
-
-#define SIZEOF_PTREGS 17*4
-
/*
* Worst-case size of the kernel mapping we need to make:
* a relocatable kernel can live anywhere in lowmem, so we need to be able
@@ -488,19 +485,13 @@ SYM_DATA_END(initial_page_table)
.data
.balign 4
-/*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack,
- .long init_thread_union + THREAD_SIZE -
- SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING)
+SYM_DATA(initial_stack, .long __top_init_kernel_stack)
__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8198fbd70e5..330922b328bf 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -66,7 +66,7 @@ SYM_CODE_START_NOALIGN(startup_64)
mov %rsi, %r15
/* Set up the stack for verify_cpu() */
- leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
+ leaq __top_init_kernel_stack(%rip), %rsp
/* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
@@ -720,7 +720,7 @@ SYM_DATA(smpboot_control, .long 0)
SYM_DATA(phys_base, .quad 0x0)
EXPORT_SYMBOL(phys_base)
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
__PAGE_ALIGNED_BSS
SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 0cd53fa8c65d..f445bec516a0 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -153,7 +153,7 @@ static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_X86_LOCAL_APIC
INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
-# ifdef CONFIG_HAVE_KVM
+# if IS_ENABLED(CONFIG_KVM)
INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
@@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = {
# endif
INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+ INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification),
+# endif
#endif
};
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 11761c124545..385e3a5fc304 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -22,6 +22,8 @@
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/thermal.h>
+#include <asm/posted_intr.h>
+#include <asm/irq_remapping.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -164,7 +166,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
seq_printf(p, "%*s: ", prec, "PIN");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
@@ -182,6 +184,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
irq_stats(j)->kvm_posted_intr_wakeup_ipis);
seq_puts(p, " Posted-interrupt wakeup event\n");
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ seq_printf(p, "%*s: ", prec, "PMN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->posted_msi_notification_count);
+ seq_puts(p, " Posted MSI notification event\n");
+#endif
return 0;
}
@@ -240,24 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc,
__handle_irq(desc, regs);
}
-/*
- * common_interrupt() handles all normal device IRQ's (the special SMP
- * cross-CPU interrupts have their own entry points).
- */
-DEFINE_IDTENTRY_IRQ(common_interrupt)
+static __always_inline int call_irq_handler(int vector, struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
-
- /* entry code tells RCU that we're not quiescent. Check it. */
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+ int ret = 0;
desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
handle_irq(desc, regs);
} else {
- apic_eoi();
-
+ ret = -EINVAL;
if (desc == VECTOR_UNUSED) {
pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
__func__, smp_processor_id(),
@@ -267,6 +268,23 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
}
}
+ return ret;
+}
+
+/*
+ * common_interrupt() handles all normal device IRQ's (the special SMP
+ * cross-CPU interrupts have their own entry points).
+ */
+DEFINE_IDTENTRY_IRQ(common_interrupt)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* entry code tells RCU that we're not quiescent. Check it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+
+ if (unlikely(call_irq_handler(vector, regs)))
+ apic_eoi();
+
set_irq_regs(old_regs);
}
@@ -290,7 +308,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
}
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
static void dummy_handler(void) {}
static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
@@ -334,12 +352,139 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
}
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+
+/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
+DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+
+void intel_posted_msi_init(void)
+{
+ u32 destination;
+ u32 apic_id;
+
+ this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
+
+ /*
+ * APIC destination ID is stored in bit 8:15 while in XAPIC mode.
+ * VT-d spec. CH 9.11
+ */
+ apic_id = this_cpu_read(x86_cpu_to_apicid);
+ destination = x2apic_enabled() ? apic_id : apic_id << 8;
+ this_cpu_write(posted_msi_pi_desc.ndst, destination);
+}
+
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ * accessed by both CPU and IOMMU.
+ * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
+ * for checking and clearing posted interrupt request (PIR), a 256 bit field
+ * within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ * line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
+ * cache line. The cache line states after each operation are as follows:
+ * CPU IOMMU PID Cache line state
+ * ---------------------------------------------------------------
+ *...read64 exclusive
+ *...lock xchg64 modified
+ *... post/atomic swap invalid
+ *...-------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * to dispatch interrupt handlers.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ * read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ * read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+{
+ int i, vec = FIRST_EXTERNAL_VECTOR;
+ unsigned long pir_copy[4];
+ bool handled = false;
+
+ for (i = 0; i < 4; i++)
+ pir_copy[i] = pir[i];
+
+ for (i = 0; i < 4; i++) {
+ if (!pir_copy[i])
+ continue;
+
+ pir_copy[i] = arch_xchg(&pir[i], 0);
+ handled = true;
+ }
+
+ if (handled) {
+ for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+ call_irq_handler(vec, regs);
+ }
+
+ return handled;
+}
+
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workload.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
+
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct pi_desc *pid;
+ int i = 0;
+
+ pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+ inc_irq_stat(posted_msi_notification_count);
+ irq_enter();
+
+ /*
+ * Max coalescing count includes the extra round of handle_pending_pir
+ * after clearing the outstanding notification bit. Hence, at most
+ * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
+ */
+ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+ if (!handle_pending_pir(pid->pir64, regs))
+ break;
+ }
+
+ /*
+ * Clear outstanding notification bit to allow new IRQ notifications,
+ * do this last to maximize the window of interrupt coalescing.
+ */
+ pi_clear_on(pid);
+
+ /*
+ * There could be a race of PI notification and the clearing of ON bit,
+ * process PIR bits one last time such that handling the new interrupts
+ * are not delayed until the next IRQ.
+ */
+ handle_pending_pir(pid->pir64, regs);
+
+ apic_eoi();
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+#endif /* X86_POSTED_MSI */
#ifdef CONFIG_HOTPLUG_CPU
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
- unsigned int irr, vector;
+ unsigned int vector;
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
@@ -366,8 +511,7 @@ void fixup_irqs(void)
if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
- if (irr & (1 << (vector % 32))) {
+ if (is_vector_pending(vector)) {
desc = __this_cpu_read(vector_irq[vector]);
raw_spin_lock(&desc->lock);
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index cde167b0ea92..68530fad05f7 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -263,11 +263,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
memset(&params->hd0_info, 0, sizeof(params->hd0_info));
memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+#ifdef CONFIG_CRASH_DUMP
if (image->type == KEXEC_TYPE_CRASH) {
ret = crash_setup_memmap_entries(image, params);
if (ret)
return ret;
} else
+#endif
setup_e820_entries(params);
nr_e820_entries = params->e820_entries;
@@ -433,12 +435,14 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
return ERR_PTR(-EINVAL);
}
+#ifdef CONFIG_CRASH_DUMP
/* Allocate and load backup region */
if (image->type == KEXEC_TYPE_CRASH) {
ret = crash_load_segments(image);
if (ret)
return ERR_PTR(ret);
}
+#endif
/*
* Load purgatory. For 64bit entry point, purgatory code can be
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index c993521d4933..e772276f5aa9 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -78,7 +78,7 @@
#endif
/* Ensure if the instruction can be boostable */
-extern int can_boost(struct insn *insn, void *orig_addr);
+extern bool can_boost(struct insn *insn, void *orig_addr);
/* Recover instruction if given address is probed */
extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
unsigned long addr);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a0ce46c0a2d8..d0e49bd7c6f3 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -137,14 +137,14 @@ NOKPROBE_SYMBOL(synthesize_relcall);
* Returns non-zero if INSN is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
-int can_boost(struct insn *insn, void *addr)
+bool can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
insn_byte_t prefix;
int i;
if (search_exception_tables((unsigned long)addr))
- return 0; /* Page fault may occur on this address. */
+ return false; /* Page fault may occur on this address. */
/* 2nd-byte opcode */
if (insn->opcode.nbytes == 2)
@@ -152,7 +152,7 @@ int can_boost(struct insn *insn, void *addr)
(unsigned long *)twobyte_is_boostable);
if (insn->opcode.nbytes != 1)
- return 0;
+ return false;
for_each_insn_prefix(insn, i, prefix) {
insn_attr_t attr;
@@ -160,7 +160,7 @@ int can_boost(struct insn *insn, void *addr)
attr = inat_get_opcode_attribute(prefix);
/* Can't boost Address-size override prefix and CS override prefix */
if (prefix == 0x2e || inat_is_address_size_prefix(attr))
- return 0;
+ return false;
}
opcode = insn->opcode.bytes[0];
@@ -169,24 +169,35 @@ int can_boost(struct insn *insn, void *addr)
case 0x62: /* bound */
case 0x70 ... 0x7f: /* Conditional jumps */
case 0x9a: /* Call far */
- case 0xc0 ... 0xc1: /* Grp2 */
case 0xcc ... 0xce: /* software exceptions */
- case 0xd0 ... 0xd3: /* Grp2 */
case 0xd6: /* (UD) */
case 0xd8 ... 0xdf: /* ESC */
case 0xe0 ... 0xe3: /* LOOP*, JCXZ */
case 0xe8 ... 0xe9: /* near Call, JMP */
case 0xeb: /* Short JMP */
case 0xf0 ... 0xf4: /* LOCK/REP, HLT */
+ /* ... are not boostable */
+ return false;
+ case 0xc0 ... 0xc1: /* Grp2 */
+ case 0xd0 ... 0xd3: /* Grp2 */
+ /*
+ * AMD uses nnn == 110 as SHL/SAL, but Intel makes it reserved.
+ */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b110;
case 0xf6 ... 0xf7: /* Grp3 */
+ /* AMD uses nnn == 001 as TEST, but Intel makes it reserved. */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b001;
case 0xfe: /* Grp4 */
- /* ... are not boostable */
- return 0;
+ /* Only INC and DEC are boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001;
case 0xff: /* Grp5 */
- /* Only indirect jmp is boostable */
- return X86_MODRM_REG(insn->modrm.bytes[0]) == 4;
+ /* Only INC, DEC, and indirect JMP are boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b100;
default:
- return 1;
+ return true;
}
}
@@ -252,21 +263,40 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add
return __recover_probed_insn(buf, addr);
}
-/* Check if paddr is at an instruction boundary */
-static int can_probe(unsigned long paddr)
+/* Check if insn is INT or UD */
+static inline bool is_exception_insn(struct insn *insn)
+{
+ /* UD uses 0f escape */
+ if (insn->opcode.bytes[0] == 0x0f) {
+ /* UD0 / UD1 / UD2 */
+ return insn->opcode.bytes[1] == 0xff ||
+ insn->opcode.bytes[1] == 0xb9 ||
+ insn->opcode.bytes[1] == 0x0b;
+ }
+
+ /* INT3 / INT n / INTO / INT1 */
+ return insn->opcode.bytes[0] == 0xcc ||
+ insn->opcode.bytes[0] == 0xcd ||
+ insn->opcode.bytes[0] == 0xce ||
+ insn->opcode.bytes[0] == 0xf1;
+}
+
+/*
+ * Check if paddr is at an instruction boundary and that instruction can
+ * be probed
+ */
+static bool can_probe(unsigned long paddr)
{
unsigned long addr, __addr, offset = 0;
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
- return 0;
+ return false;
/* Decode instructions */
addr = paddr - offset;
while (addr < paddr) {
- int ret;
-
/*
* Check if the instruction has been modified by another
* kprobe, in which case we replace the breakpoint by the
@@ -277,11 +307,10 @@ static int can_probe(unsigned long paddr)
*/
__addr = recover_probed_instruction(buf, addr);
if (!__addr)
- return 0;
+ return false;
- ret = insn_decode_kernel(&insn, (void *)__addr);
- if (ret < 0)
- return 0;
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return false;
#ifdef CONFIG_KGDB
/*
@@ -290,10 +319,26 @@ static int can_probe(unsigned long paddr)
*/
if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
kgdb_has_hit_break(addr))
- return 0;
+ return false;
#endif
addr += insn.length;
}
+
+ /* Check if paddr is at an instruction boundary */
+ if (addr != paddr)
+ return false;
+
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return false;
+
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return false;
+
+ /* INT and UD are special and should not be kprobed */
+ if (is_exception_insn(&insn))
+ return false;
+
if (IS_ENABLED(CONFIG_CFI_CLANG)) {
/*
* The compiler generates the following instruction sequence
@@ -308,13 +353,6 @@ static int can_probe(unsigned long paddr)
* Also, these movl and addl are used for showing expected
* type. So those must not be touched.
*/
- __addr = recover_probed_instruction(buf, addr);
- if (!__addr)
- return 0;
-
- if (insn_decode_kernel(&insn, (void *)__addr) < 0)
- return 0;
-
if (insn.opcode.value == 0xBA)
offset = 12;
else if (insn.opcode.value == 0x3)
@@ -324,18 +362,27 @@ static int can_probe(unsigned long paddr)
/* This movl/addl is used for decoding CFI. */
if (is_cfi_trap(addr + offset))
- return 0;
+ return false;
}
out:
- return (addr == paddr);
+ return true;
}
/* If x86 supports IBT (ENDBR) it must be skipped. */
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
bool *on_func_entry)
{
- if (is_endbr(*(u32 *)addr)) {
+ u32 insn;
+
+ /*
+ * Since 'addr' is not guaranteed to be safe to access, use
+ * copy_from_kernel_nofault() to read the instruction:
+ */
+ if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32)))
+ return NULL;
+
+ if (is_endbr(insn)) {
*on_func_entry = !offset || offset == 4;
if (*on_func_entry)
offset = 4;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 101a7c1bf200..263f8aed4e2c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -44,7 +44,7 @@
#include <asm/svm.h>
#include <asm/e820/api.h>
-DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
static int kvmapf = 1;
@@ -65,6 +65,7 @@ static int __init parse_no_stealacc(char *arg)
early_param("no-steal-acc", parse_no_stealacc);
+static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;
@@ -244,7 +245,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
{
u32 flags = 0;
- if (__this_cpu_read(apf_reason.enabled)) {
+ if (__this_cpu_read(async_pf_enabled)) {
flags = __this_cpu_read(apf_reason.flags);
__this_cpu_write(apf_reason.flags, 0);
}
@@ -295,7 +296,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
inc_irq_stat(irq_hv_callback_count);
- if (__this_cpu_read(apf_reason.enabled)) {
+ if (__this_cpu_read(async_pf_enabled)) {
token = __this_cpu_read(apf_reason.token);
kvm_async_pf_task_wake(token);
__this_cpu_write(apf_reason.token, 0);
@@ -362,7 +363,7 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
- __this_cpu_write(apf_reason.enabled, 1);
+ __this_cpu_write(async_pf_enabled, true);
pr_debug("setup async PF for cpu %d\n", smp_processor_id());
}
@@ -383,11 +384,11 @@ static void kvm_guest_cpu_init(void)
static void kvm_pv_disable_apf(void)
{
- if (!__this_cpu_read(apf_reason.enabled))
+ if (!__this_cpu_read(async_pf_enabled))
return;
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
- __this_cpu_write(apf_reason.enabled, 0);
+ __this_cpu_write(async_pf_enabled, false);
pr_debug("disable async PF for cpu %d\n", smp_processor_id());
}
@@ -770,7 +771,7 @@ static struct notifier_block kvm_pv_reboot_nb = {
* won't be valid. In cases like kexec, in which you install a new kernel, this
* means a random memory location will be kept being written.
*/
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
static void kvm_crash_shutdown(struct pt_regs *regs)
{
kvm_guest_cpu_offline(true);
@@ -853,7 +854,7 @@ static void __init kvm_guest_init(void)
kvm_guest_cpu_init();
#endif
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index bc0a5348b4a6..b180d8e497c3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -508,6 +508,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
}
#endif /* CONFIG_KEXEC_FILE */
+#ifdef CONFIG_CRASH_DUMP
+
static int
kexec_mark_range(unsigned long start, unsigned long end, bool protect)
{
@@ -552,6 +554,7 @@ void arch_kexec_unprotect_crashkres(void)
{
kexec_mark_crashkres(false);
}
+#endif
/*
* During a traditional boot under SME, SME will encrypt the kernel,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 1ccd30c8246f..e89171b0347a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -197,12 +197,12 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (!smp_check_mpc(mpc, oem, str))
return 0;
- /* Initialize the lapic mapping */
- if (!acpi_lapic)
- register_lapic_address(mpc->lapic);
-
- if (early)
+ if (early) {
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ register_lapic_address(mpc->lapic);
return 1;
+ }
/* Now process the configuration blocks. */
while (count < mpc->length) {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 9a5b372c706f..ed163c8c8604 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -580,7 +580,7 @@ EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
static char *nmi_check_stall_msg[] = {
/* */
-/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */
+/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */
/* | +------ cpu_is_offline(cpu) */
/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */
/* | | | NMI handler has been invoked. */
@@ -628,22 +628,26 @@ void nmi_backtrace_stall_check(const struct cpumask *btp)
nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
msgp = "CPU entered NMI handler function, but has not exited";
- } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
- msgp = "CPU is handling NMIs";
- } else {
- idx = ((nsp->idt_seq_snap & 0x1) << 2) |
+ } else if (nsp->idt_nmi_seq_snap == nmi_seq ||
+ nsp->idt_nmi_seq_snap + 1 == nmi_seq) {
+ idx = ((nmi_seq & 0x1) << 2) |
(cpu_is_offline(cpu) << 1) |
(nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
msgp = nmi_check_stall_msg[idx];
if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
modp = ", but OK because ignore_nmis was set";
- if (nmi_seq & 0x1)
- msghp = " (CPU currently in NMI handler function)";
- else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+ if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
msghp = " (CPU exited one NMI handler function)";
+ else if (nmi_seq & 0x1)
+ msghp = " (CPU currently in NMI handler function)";
+ else
+ msghp = " (CPU was never in an NMI handler function)";
+ } else {
+ msgp = "CPU is handling NMIs";
}
- pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
- __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
+ pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp);
+ pr_alert("%s: last activity: %lu jiffies ago.\n",
+ __func__, j - READ_ONCE(nsp->recv_jiffies));
}
}
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 319fef37d9dc..cc2c34ba7228 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -203,16 +203,6 @@ void __init probe_roms(void)
unsigned char c;
int i;
- /*
- * The ROM memory range is not part of the e820 table and is therefore not
- * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted
- * memory, and SNP requires encrypted memory to be validated before access.
- * Do that here.
- */
- snp_prep_memory(video_rom_resource.start,
- ((system_rom_resource.end + 1) - video_rom_resource.start),
- SNP_PAGE_STATE_PRIVATE);
-
/* video rom */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7062b84dd467..6d3d20e3e43a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -139,7 +139,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
log_lvl, d3, d6, d7);
}
- if (cpu_feature_enabled(X86_FEATURE_OSPKE))
+ if (cr4 & X86_CR4_PKE)
printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 830425e6d38e..f3130f762784 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -796,7 +796,7 @@ struct machine_ops machine_ops __ro_after_init = {
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
.crash_shutdown = native_machine_crash_shutdown,
#endif
};
@@ -826,7 +826,7 @@ void machine_halt(void)
machine_ops.halt();
}
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 2e7066980f3e..51a849a79c98 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -10,7 +10,6 @@
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/time.h>
-#include <asm/intel-mid.h>
#include <asm/setup.h>
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 46d5a8c520ad..55a1fc332e20 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -7,9 +7,9 @@
*/
#include <linux/acpi.h>
#include <linux/console.h>
+#include <linux/cpu.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
-#include <linux/dmi.h>
#include <linux/efi.h>
#include <linux/ima.h>
#include <linux/init_ohci1394_dma.h>
@@ -36,6 +36,7 @@
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
#include <asm/cacheinfo.h>
+#include <asm/coco.h>
#include <asm/cpu.h>
#include <asm/efi.h>
#include <asm/gart.h>
@@ -471,7 +472,7 @@ static void __init arch_reserve_crashkernel(void)
bool high = false;
int ret;
- if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
return;
ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
@@ -753,6 +754,22 @@ void __init setup_arch(char **cmdline_p)
boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
+ strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
/*
* If we have OLPC OFW, we might end up relocating the fixmap due to
* reserve_top(), so do this before touching the ioremap area.
@@ -832,22 +849,6 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
- strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
@@ -902,7 +903,7 @@ void __init setup_arch(char **cmdline_p)
efi_init();
reserve_ibft_region();
- dmi_setup();
+ x86_init.resources.dmi_setup();
/*
* VMware detection requires dmi to be available, so this
@@ -992,6 +993,7 @@ void __init setup_arch(char **cmdline_p)
* memory size.
*/
mem_encrypt_setup_arch();
+ cc_random_init();
efi_fake_memmap();
efi_find_mirror();
@@ -1206,16 +1208,6 @@ void __init i386_reserve_resources(void)
#endif /* CONFIG_X86_32 */
-#ifndef CONFIG_SMP
-void __init smp_prepare_boot_cpu(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- *c = boot_cpu_data;
- c->initialized = true;
-}
-#endif
-
static struct notifier_block kernel_offset_notifier = {
.notifier_call = dump_kernel_offset
};
@@ -1227,3 +1219,10 @@ static int __init register_kernel_offset_dumper(void)
return 0;
}
__initcall(register_kernel_offset_dumper);
+
+#ifdef CONFIG_HOTPLUG_CPU
+bool arch_cpu_is_hotpluggable(int cpu)
+{
+ return cpu > 0;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 8b04958da5e7..b4f8fa0f722c 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -1203,12 +1203,14 @@ static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
break;
case SVM_EXIT_MONITOR:
- if (opcode == 0x010f && modrm == 0xc8)
+ /* MONITOR and MONITORX instructions generate the same error code */
+ if (opcode == 0x010f && (modrm == 0xc8 || modrm == 0xfa))
return ES_OK;
break;
case SVM_EXIT_MWAIT:
- if (opcode == 0x010f && modrm == 0xc9)
+ /* MWAIT and MWAITX instructions generate the same error code */
+ if (opcode == 0x010f && (modrm == 0xc9 || modrm == 0xfb))
return ES_OK;
break;
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index b59b09c2f284..3342ed58e168 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -23,6 +23,7 @@
#include <linux/platform_device.h>
#include <linux/io.h>
#include <linux/psp-sev.h>
+#include <linux/dmi.h>
#include <uapi/linux/sev-guest.h>
#include <asm/init.h>
@@ -647,7 +648,7 @@ static u64 __init get_secrets_page(void)
static u64 __init get_snp_jump_table_addr(void)
{
- struct snp_secrets_page_layout *layout;
+ struct snp_secrets_page *secrets;
void __iomem *mem;
u64 pa, addr;
@@ -661,9 +662,9 @@ static u64 __init get_snp_jump_table_addr(void)
return 0;
}
- layout = (__force struct snp_secrets_page_layout *)mem;
+ secrets = (__force struct snp_secrets_page *)mem;
- addr = layout->os_area.ap_jump_table_pa;
+ addr = secrets->os_area.ap_jump_table_pa;
iounmap(mem);
return addr;
@@ -795,21 +796,6 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr
early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
}
-void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
-{
- unsigned long vaddr, npages;
-
- vaddr = (unsigned long)__va(paddr);
- npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
-
- if (op == SNP_PAGE_STATE_PRIVATE)
- early_snp_set_memory_private(vaddr, paddr, npages);
- else if (op == SNP_PAGE_STATE_SHARED)
- early_snp_set_memory_shared(vaddr, paddr, npages);
- else
- WARN(1, "invalid memory op %d\n", op);
-}
-
static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
unsigned long vaddr_end, int op)
{
@@ -952,7 +938,7 @@ static int snp_set_vmsa(void *va, bool vmsa)
#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
-static void *snp_alloc_vmsa_page(void)
+static void *snp_alloc_vmsa_page(int cpu)
{
struct page *p;
@@ -964,7 +950,7 @@ static void *snp_alloc_vmsa_page(void)
*
* Allocate an 8k page which is also 8k-aligned.
*/
- p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ p = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
if (!p)
return NULL;
@@ -1033,7 +1019,7 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
* #VMEXIT of that vCPU would wipe out all of the settings being done
* here.
*/
- vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
+ vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(cpu);
if (!vmsa)
return -ENOMEM;
@@ -1355,7 +1341,7 @@ static void __init alloc_runtime_data(int cpu)
{
struct sev_es_runtime_data *data;
- data = memblock_alloc(sizeof(*data), PAGE_SIZE);
+ data = memblock_alloc_node(sizeof(*data), PAGE_SIZE, cpu_to_node(cpu));
if (!data)
panic("Can't allocate SEV-ES runtime data");
@@ -2136,6 +2122,17 @@ void __head __noreturn snp_abort(void)
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
}
+/*
+ * SEV-SNP guests should only execute dmi_setup() if EFI_CONFIG_TABLES are
+ * enabled, as the alternative (fallback) logic for DMI probing in the legacy
+ * ROM region can cause a crash since this region is not pre-validated.
+ */
+void __init snp_dmi_setup(void)
+{
+ if (efi_enabled(EFI_CONFIG_TABLES))
+ dmi_setup();
+}
+
static void dump_cpuid_table(void)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -2287,16 +2284,6 @@ static int __init snp_init_platform_device(void)
}
device_initcall(snp_init_platform_device);
-void kdump_sev_callback(void)
-{
- /*
- * Do wbinvd() on remote CPUs when SNP is enabled in order to
- * safely do SNP_SHUTDOWN on the local CPU.
- */
- if (cpu_feature_enabled(X86_FEATURE_SEV_SNP))
- wbinvd();
-}
-
void sev_show_status(void)
{
int i;
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 59e15dd8d0f8..6f1e9883f074 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -163,8 +163,8 @@ static int shstk_setup(void)
if (features_enabled(ARCH_SHSTK_SHSTK))
return 0;
- /* Also not supported for 32 bit and x32 */
- if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall())
+ /* Also not supported for 32 bit */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
return -EOPNOTSUPP;
size = adjust_shstk_size(0);
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index c12624bc82a3..ef654530bf5a 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -34,7 +34,7 @@
#include <asm/gsseg.h>
#ifdef CONFIG_IA32_EMULATION
-#include <asm/ia32_unistd.h>
+#include <asm/unistd_32_ia32.h>
static inline void reload_segments(struct sigcontext_32 *sc)
{
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 23d8aaf8d9fd..8a94053c5444 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -315,6 +315,9 @@ int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
uc_flags = frame_uc_flags(regs);
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -377,6 +380,9 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
if (compat_restore_altstack(&frame->uc.uc_stack))
goto badframe;
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 2908e063d7d8..18266cc3d98c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -286,7 +286,7 @@ struct smp_ops smp_ops = {
.smp_cpus_done = native_smp_cpus_done,
.stop_other_cpus = native_stop_other_cpus,
-#if defined(CONFIG_KEXEC_CORE)
+#if defined(CONFIG_CRASH_DUMP)
.crash_stop_other_cpus = kdump_nmi_shootdown_cpus,
#endif
.smp_send_reschedule = native_smp_send_reschedule,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fe355c89f6c1..0c35207320cb 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -313,14 +313,6 @@ static void notrace start_secondary(void *unused)
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-static void __init smp_store_boot_cpu_info(void)
-{
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- *c = boot_cpu_data;
- c->initialized = true;
-}
-
/*
* The bootstrap kernel entry code has set these up. Save them for
* a given CPU
@@ -446,9 +438,9 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
*/
static const struct x86_cpu_id intel_cod_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */
- X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */
+ X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */
{}
};
@@ -1039,36 +1031,24 @@ static __init void disable_smp(void)
cpumask_set_cpu(0, topology_die_cpumask(0));
}
-static void __init smp_cpu_index_default(void)
-{
- int i;
- struct cpuinfo_x86 *c;
-
- for_each_possible_cpu(i) {
- c = &cpu_data(i);
- /* mark all to hotplug */
- c->cpu_index = nr_cpu_ids;
- }
-}
-
void __init smp_prepare_cpus_common(void)
{
- unsigned int i;
+ unsigned int cpu, node;
- smp_cpu_index_default();
+ /* Mark all except the boot CPU as hotpluggable */
+ for_each_possible_cpu(cpu) {
+ if (cpu)
+ per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids;
+ }
- /*
- * Setup boot CPU information
- */
- smp_store_boot_cpu_info(); /* Final full version of the data */
- mb();
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
- for_each_possible_cpu(i) {
- zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
+ zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node);
}
set_cpu_sibling_map(0);
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
deleted file mode 100644
index d42c28b8bfd8..000000000000
--- a/arch/x86/kernel/topology.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Populate sysfs with topology information
- *
- * Written by: Matthew Dobson, IBM Corporation
- * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
-#include <linux/interrupt.h>
-#include <linux/nodemask.h>
-#include <linux/export.h>
-#include <linux/mmzone.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/irq.h>
-#include <asm/io_apic.h>
-#include <asm/cpu.h>
-
-#ifdef CONFIG_HOTPLUG_CPU
-bool arch_cpu_is_hotpluggable(int cpu)
-{
- return cpu > 0;
-}
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5a69a49acc96..06b170759e5b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -26,7 +26,7 @@
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/i8259.h>
#include <asm/uv/uv.h>
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(tsc_khz);
static int __read_mostly tsc_unstable;
static unsigned int __initdata tsc_early_khz;
-static DEFINE_STATIC_KEY_FALSE(__use_tsc);
+static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);
int tsc_clocksource_reliable;
@@ -682,7 +682,7 @@ unsigned long native_calibrate_tsc(void)
* clock.
*/
if (crystal_khz == 0 &&
- boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D)
+ boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
crystal_khz = 25000;
/*
@@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void)
* For Atom SoCs TSC is the only reliable clocksource.
* Mark TSC reliable so no watchdog on it.
*/
- if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 6555a857a1e6..deeb02825670 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -147,13 +147,13 @@ static const struct freq_desc freq_desc_lgm = {
};
static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm),
{}
};
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 1123ef3ccf90..4334033658ed 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -193,11 +193,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
cur->warned = false;
/*
- * If a non-zero TSC value for socket 0 may be valid then the default
- * adjusted value cannot assumed to be zero either.
+ * The default adjust value cannot be assumed to be zero on any socket.
*/
- if (tsc_async_resets)
- cur->adjusted = bootval;
+ cur->adjusted = bootval;
/*
* Check whether this CPU is the first in a package to come up. In
diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/vmcore_info_32.c
index 8a89c109e20a..5995a749288a 100644
--- a/arch/x86/kernel/crash_core_32.c
+++ b/arch/x86/kernel/vmcore_info_32.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/pgtable.h>
#include <asm/setup.h>
diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/vmcore_info_64.c
index 7d255f882afe..0dec7d868754 100644
--- a/arch/x86/kernel/crash_core_64.c
+++ b/arch/x86/kernel/vmcore_info_64.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/pgtable.h>
#include <asm/setup.h>
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 56451fd2099e..3509afc6a672 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -15,11 +15,7 @@
* put it inside the section definition.
*/
-#ifdef CONFIG_X86_32
-#define LOAD_OFFSET __PAGE_OFFSET
-#else
#define LOAD_OFFSET __START_KERNEL_map
-#endif
#define RUNTIME_DISCARD_EXIT
#define EMITS_PT_NOTE
@@ -114,11 +110,10 @@ PHDRS {
SECTIONS
{
+ . = __START_KERNEL;
#ifdef CONFIG_X86_32
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
#else
- . = __START_KERNEL;
phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
#endif
@@ -172,6 +167,9 @@ SECTIONS
/* init_task */
INIT_TASK_DATA(THREAD_SIZE)
+ /* equivalent to task_pt_regs(&init_task) */
+ __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
+
#ifdef CONFIG_X86_32
/* 32 bit has nosave before _edata */
NOSAVE_DATA
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a42830dc151b..d5dc5a92635a 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -3,6 +3,7 @@
*
* For licencing details see kernel-base/COPYING
*/
+#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/export.h>
@@ -66,6 +67,7 @@ struct x86_init_ops x86_init __initdata = {
.probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
.memory_setup = e820__memory_setup_default,
+ .dmi_setup = dmi_setup,
},
.mpparse = {
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 65ed14b6540b..0ebdd088f28b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -7,7 +7,6 @@ source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
- depends on HAVE_KVM || X86
default y
help
Say Y here to get to see options for using your Linux host to run other
@@ -20,7 +19,6 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM
depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
select KVM_COMMON
@@ -29,9 +27,9 @@ config KVM
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO
select HAVE_KVM_DIRTY_RING_ACQ_REL
- select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_READONLY_MEM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
@@ -120,10 +118,11 @@ config KVM_AMD
will be called kvm-amd.
config KVM_AMD_SEV
- def_bool y
bool "AMD Secure Encrypted Virtualization (SEV) support"
+ default y
depends on KVM_AMD && X86_64
depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m)
+ select ARCH_HAS_CC_PLATFORM
help
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
with Encrypted State (SEV-ES) on AMD processors.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 475b5fa917a6..addc44fc7187 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,6 @@
ccflags-y += -I $(srctree)/arch/x86/kvm
ccflags-$(CONFIG_KVM_WERROR) += -Werror
-ifeq ($(CONFIG_FRAME_POINTER),y)
-OBJECT_FILES_NON_STANDARD_vmenter.o := y
-endif
-
include $(srctree)/virt/kvm/Makefile.kvm
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index adba49afb5fe..77352a4abd87 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -189,15 +189,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
return 0;
}
-static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
- const char *sig)
+static struct kvm_hypervisor_cpuid __kvm_get_hypervisor_cpuid(struct kvm_cpuid_entry2 *entries,
+ int nent, const char *sig)
{
struct kvm_hypervisor_cpuid cpuid = {};
struct kvm_cpuid_entry2 *entry;
u32 base;
for_each_possible_hypervisor_cpuid_base(base) {
- entry = kvm_find_cpuid_entry(vcpu, base);
+ entry = cpuid_entry2_find(entries, nent, base, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
if (entry) {
u32 signature[3];
@@ -217,22 +217,29 @@ static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcp
return cpuid;
}
-static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
- struct kvm_cpuid_entry2 *entries, int nent)
+static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
+ const char *sig)
{
- u32 base = vcpu->arch.kvm_cpuid.base;
-
- if (!base)
- return NULL;
+ return __kvm_get_hypervisor_cpuid(vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent, sig);
+}
- return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES,
+static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_cpuid_entry2 *entries,
+ int nent, u32 kvm_cpuid_base)
+{
+ return cpuid_entry2_find(entries, nent, kvm_cpuid_base | KVM_CPUID_FEATURES,
KVM_CPUID_INDEX_NOT_SIGNIFICANT);
}
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
{
- return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent);
+ u32 base = vcpu->arch.kvm_cpuid.base;
+
+ if (!base)
+ return NULL;
+
+ return __kvm_find_kvm_cpuid_features(vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent, base);
}
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@@ -266,6 +273,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
int nent)
{
struct kvm_cpuid_entry2 *best;
+ struct kvm_hypervisor_cpuid kvm_cpuid;
best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
if (best) {
@@ -292,10 +300,12 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
- if (kvm_hlt_in_guest(vcpu->kvm) && best &&
- (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
- best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+ kvm_cpuid = __kvm_get_hypervisor_cpuid(entries, nent, KVM_SIGNATURE);
+ if (kvm_cpuid.base) {
+ best = __kvm_find_kvm_cpuid_features(entries, nent, kvm_cpuid.base);
+ if (kvm_hlt_in_guest(vcpu->kvm) && best)
+ best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
+ }
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
best = cpuid_entry2_find(entries, nent, 0x1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
@@ -366,6 +376,7 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_update_pv_runtime(vcpu);
+ vcpu->arch.is_amd_compatible = guest_cpuid_is_amd_or_hygon(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 856e3037e74f..23dbb9eb277c 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -120,6 +120,16 @@ static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu)
return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx);
}
+static inline bool guest_cpuid_is_amd_compatible(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.is_amd_compatible;
+}
+
+static inline bool guest_cpuid_is_intel_compatible(struct kvm_vcpu *vcpu)
+{
+ return !guest_cpuid_is_amd_compatible(vcpu);
+}
+
static inline int guest_cpuid_family(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 95ea1a1f7403..999227fc7c66 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -189,9 +189,8 @@ static const struct file_operations mmu_rmaps_stat_fops = {
.release = kvm_mmu_rmaps_stat_release,
};
-int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
&mmu_rmaps_stat_fops);
- return 0;
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e223043ef5b2..5d4c86133453 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1820,22 +1820,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
return X86EMUL_CONTINUE;
}
-static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
+static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len)
{
struct segmented_address addr;
- rsp_increment(ctxt, -bytes);
+ rsp_increment(ctxt, -len);
addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
addr.seg = VCPU_SREG_SS;
- return segmented_write(ctxt, addr, data, bytes);
+ return segmented_write(ctxt, addr, data, len);
}
static int em_push(struct x86_emulate_ctxt *ctxt)
{
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
- return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
+ return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1863,7 +1863,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
void *dest, int len)
{
int rc;
- unsigned long val, change_mask;
+ unsigned long val = 0;
+ unsigned long change_mask;
int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
int cpl = ctxt->ops->cpl(ctxt);
@@ -1920,7 +1921,7 @@ static int em_enter(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
rbp = reg_read(ctxt, VCPU_REGS_RBP);
- rc = push(ctxt, &rbp, stack_size(ctxt));
+ rc = emulate_push(ctxt, &rbp, stack_size(ctxt));
if (rc != X86EMUL_CONTINUE)
return rc;
assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
@@ -1954,7 +1955,7 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
{
int seg = ctxt->src2.val;
- unsigned long selector;
+ unsigned long selector = 0;
int rc;
rc = emulate_pop(ctxt, &selector, 2);
@@ -2000,7 +2001,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
{
int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RDI;
- u32 val;
+ u32 val = 0;
while (reg >= VCPU_REGS_RAX) {
if (reg == VCPU_REGS_RSP) {
@@ -2229,7 +2230,7 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
static int em_ret(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip;
+ unsigned long eip = 0;
rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2241,7 +2242,8 @@ static int em_ret(struct x86_emulate_ctxt *ctxt)
static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip, cs;
+ unsigned long eip = 0;
+ unsigned long cs = 0;
int cpl = ctxt->ops->cpl(ctxt);
struct desc_struct new_desc;
@@ -3011,7 +3013,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ret = em_push(ctxt);
}
- ops->get_dr(ctxt, 7, &dr7);
+ dr7 = ops->get_dr(ctxt, 7);
ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
return ret;
@@ -3184,7 +3186,7 @@ fail:
static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip;
+ unsigned long eip = 0;
rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -3866,15 +3868,6 @@ static int check_cr_access(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
-{
- unsigned long dr7;
-
- ctxt->ops->get_dr(ctxt, 7, &dr7);
-
- return dr7 & DR7_GD;
-}
-
static int check_dr_read(struct x86_emulate_ctxt *ctxt)
{
int dr = ctxt->modrm_reg;
@@ -3887,10 +3880,10 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
return emulate_ud(ctxt);
- if (check_dr7_gd(ctxt)) {
+ if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) {
ulong dr6;
- ctxt->ops->get_dr(ctxt, 6, &dr6);
+ dr6 = ctxt->ops->get_dr(ctxt, 6);
dr6 &= ~DR_TRAP_BITS;
dr6 |= DR6_BD | DR6_ACTIVE_LOW;
ctxt->ops->set_dr(ctxt, 6, dr6);
@@ -3962,7 +3955,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
* protected mode.
*/
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
- ctxt->ops->check_pmc(ctxt, rcx))
+ ctxt->ops->check_rdpmc_early(ctxt, rcx))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -4505,11 +4498,11 @@ static const struct instr_dual instr_dual_0f_38_f1 = {
};
static const struct gprefix three_byte_0f_38_f0 = {
- ID(0, &instr_dual_0f_38_f0), N, N, N
+ ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N
};
static const struct gprefix three_byte_0f_38_f1 = {
- ID(0, &instr_dual_0f_38_f1), N, N, N
+ ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N
};
/*
@@ -5449,7 +5442,7 @@ twobyte_insn:
ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
break;
case 0x21: /* mov from dr to reg */
- ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
+ ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg);
break;
case 0x40 ... 0x4f: /* cmov */
if (test_cc(ctxt->b, ctxt->eflags))
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index e6d149825169..5382646162a3 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -203,12 +203,12 @@ struct x86_emulate_ops {
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
- void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
- int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
+ int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
void (*halt)(struct x86_emulate_ctxt *ctxt);
void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1edf93ee3395..ebf41023be38 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "ioapic.h"
#include "trace.h"
#include "x86.h"
+#include "xen.h"
#include "cpuid.h"
#include "hyperv.h"
#include "smm.h"
@@ -124,6 +125,9 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
+
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
@@ -499,8 +503,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
/* Check if there are APF page ready requests pending */
- if (enabled)
+ if (enabled) {
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
@@ -2466,8 +2472,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!vcpu->arch.apic)
+ if (!vcpu->arch.apic) {
+ static_branch_dec(&kvm_has_noapic_vcpu);
return;
+ }
hrtimer_cancel(&apic->lapic_timer.timer);
@@ -2768,7 +2776,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL);
- if (r && lvt_type == APIC_LVTPC)
+ if (r && lvt_type == APIC_LVTPC &&
+ guest_cpuid_is_intel_compatible(apic->vcpu))
kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED);
return r;
}
@@ -2809,6 +2818,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
ASSERT(vcpu != NULL);
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ static_branch_inc(&kvm_has_noapic_vcpu);
+ return 0;
+ }
+
apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
if (!apic)
goto nomem;
@@ -2847,6 +2861,21 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as this vCPU
+ * will not get notified of any changes until this vCPU is visible to
+ * other vCPUs (marked online and added to the set of vCPUs).
+ *
+ * Opportunistically mark APICv active as VMX in particularly is highly
+ * unlikely to have inhibits. Ignore the current per-VM APICv state so
+ * that vCPU creation is guaranteed to run with a deterministic value,
+ * the request will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
+ apic->apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
return 0;
nomem_free_apic:
kfree(apic);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2b515acd8e72..db007a4dffa2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3110,7 +3110,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
/*
* Read each entry once. As above, a non-leaf entry can be promoted to
* a huge page _during_ this walk. Re-reading the entry could send the
- * walk into the weeks, e.g. p*d_large() returns false (sees the old
+ * walk into the weeks, e.g. p*d_leaf() returns false (sees the old
* value) and then p*d_offset() walks into the target huge page instead
* of the old page table (sees the new value).
*/
@@ -3126,7 +3126,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
if (pud_none(pud) || !pud_present(pud))
goto out;
- if (pud_large(pud)) {
+ if (pud_leaf(pud)) {
level = PG_LEVEL_1G;
goto out;
}
@@ -3135,7 +3135,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
if (pmd_none(pmd) || !pmd_present(pmd))
goto out;
- if (pmd_large(pmd))
+ if (pmd_leaf(pmd))
level = PG_LEVEL_2M;
out:
@@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (WARN_ON_ONCE(!sp))
return;
- if (is_tdp_mmu_page(sp))
+ if (is_tdp_mmu_page(sp)) {
+ lockdep_assert_held_read(&kvm->mmu_lock);
kvm_tdp_mmu_put_root(kvm, sp);
- else if (!--sp->root_count && sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ } else {
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ }
*root_hpa = INVALID_PAGE;
}
@@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
{
+ bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
int i;
LIST_HEAD(invalid_list);
bool free_active_root;
@@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
return;
}
- write_lock(&kvm->mmu_lock);
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
mmu->root.pgd = 0;
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu) {
+ read_unlock(&kvm->mmu_lock);
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ } else {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+ }
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
unsigned i;
int r;
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_alloc_root(vcpu);
+
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
if (r < 0)
goto out_unlock;
- if (tdp_mmu_enabled) {
- root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
- mmu->root.hpa = root;
- } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+ if (shadow_root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
mmu->root.hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
@@ -4922,7 +4935,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
context->cpu_role.base.level, is_efer_nx(context),
guest_can_use(vcpu, X86_FEATURE_GBPAGES),
is_cr4_pse(context),
- guest_cpuid_is_amd_or_hygon(vcpu));
+ guest_cpuid_is_amd_compatible(vcpu));
}
static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
@@ -5563,9 +5576,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* that problem is swept under the rug; KVM's CPUID API is horrific and
* it's all but impossible to solve it without introducing a new API.
*/
- vcpu->arch.root_mmu.root_role.word = 0;
- vcpu->arch.guest_mmu.root_role.word = 0;
- vcpu->arch.nested_mmu.root_role.word = 0;
+ vcpu->arch.root_mmu.root_role.invalid = 1;
+ vcpu->arch.guest_mmu.root_role.invalid = 1;
+ vcpu->arch.nested_mmu.root_role.invalid = 1;
vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
@@ -7039,9 +7052,7 @@ int kvm_mmu_vendor_module_init(void)
kvm_mmu_reset_all_pte_masks();
- pte_list_desc_cache = kmem_cache_create("pte_list_desc",
- sizeof(struct pte_list_desc),
- 0, SLAB_ACCOUNT, NULL);
+ pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
if (!pte_list_desc_cache)
goto out;
@@ -7388,7 +7399,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
* by the memslot, KVM can't use a hugepage due to the
* misaligned address regardless of memory attributes.
*/
- if (gfn >= slot->base_gfn) {
+ if (gfn >= slot->base_gfn &&
+ gfn + nr_pages <= slot->base_gfn + slot->npages) {
if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
hugepage_clear_mixed(slot, gfn, level);
else
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index c87da11f3a04..f6448284c18e 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -20,10 +20,23 @@
#include "mmu_internal.h"
#include "page_track.h"
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * Read external_write_tracking_enabled before related pointers. Pairs
+ * with the smp_store_release in kvm_page_track_write_tracking_enable().
+ */
+ return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+ return false;
+#endif
+}
+
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
{
- return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
- !tdp_enabled || kvm_shadow_root_allocated(kvm);
+ return kvm_external_write_tracking_enabled(kvm) ||
+ kvm_shadow_root_allocated(kvm) || !tdp_enabled;
}
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
@@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm)
return init_srcu_struct(&head->track_srcu);
}
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * Check for *any* write tracking user (not just external users) under
+ * lock. This avoids unnecessary work, e.g. if KVM itself is using
+ * write tracking, or if two external users raced when registering.
+ */
+ if (kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Intentionally do NOT free allocations on failure to
+ * avoid having to track which allocations were made
+ * now versus when the memslot was created. The
+ * metadata is guaranteed to be freed when the slot is
+ * freed, and will be kept/used if userspace retries
+ * the failed ioctl() instead of killing the VM.
+ */
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+out_success:
+ /*
+ * Ensure that external_write_tracking_enabled becomes true strictly
+ * after all the related pointers are set.
+ */
+ smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
/*
* register the notifier so that event interception for the tracked guest
* pages can be received.
@@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
+ int r;
if (!kvm || kvm->mm != current->mm)
return -ESRCH;
+ if (!kvm_external_write_tracking_enabled(kvm)) {
+ r = kvm_enable_external_write_tracking(kvm);
+ if (r)
+ return r;
+ }
+
kvm_get_kvm(kvm);
head = &kvm->arch.track_notifier_head;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6ae19b4ee5b1..04c1f0957fea 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -149,11 +149,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* If shared is set, this function is operating under the MMU lock in read
* mode.
*/
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
- for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
- ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
+ if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
@@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* Holding mmu_lock for write obviates the need for RCU protection as the list
* is guaranteed to be stable.
*/
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
- if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
- kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ ((_only_valid) && (_root)->role.invalid))) { \
} else
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
+
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
+
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
@@ -216,22 +223,41 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role role = mmu->root_role;
+ int as_id = kvm_mmu_role_as_id(role);
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
- lockdep_assert_held_write(&kvm->mmu_lock);
+ /*
+ * Check for an existing root before acquiring the pages lock to avoid
+ * unnecessary serialization if multiple vCPUs are loading a new root.
+ * E.g. when bringing up secondary vCPUs, KVM will already have created
+ * a valid root on behalf of the primary vCPU.
+ */
+ read_lock(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+ if (root->role.word == role.word)
+ goto out_read_unlock;
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
/*
- * Check for an existing root before allocating a new one. Note, the
- * role check prevents consuming an invalid root.
+ * Recheck for an existing root after acquiring the pages lock, another
+ * vCPU may have raced ahead and created a new usable root. Manually
+ * walk the list of roots as the standard macros assume that the pages
+ * lock is *not* held. WARN if grabbing a reference to a usable root
+ * fails, as the last reference to a root can only be put *after* the
+ * root has been invalidated, which requires holding mmu_lock for write.
*/
- for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
if (root->role.word == role.word &&
- kvm_tdp_mmu_get_root(root))
- goto out;
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+ goto out_spin_unlock;
}
root = tdp_mmu_alloc_sp(vcpu);
@@ -245,13 +271,20 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
* is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
*/
refcount_set(&root->tdp_mmu_root_count, 2);
-
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-out:
- return __pa(root->spt);
+out_spin_unlock:
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+ read_unlock(&kvm->mmu_lock);
+ /*
+ * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+ * and actually consuming the root if it's invalidated after dropping
+ * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+ */
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ return 0;
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -734,15 +767,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
/*
- * To avoid RCU stalls due to recursively removing huge swaths of SPs,
- * split the zap into two passes. On the first pass, zap at the 1gb
- * level, and then zap top-level SPs on the second pass. "1gb" is not
- * arbitrary, as KVM must be able to zap a 1gb shadow page without
- * inducing a stall to allow in-place replacement with a 1gb hugepage.
+ * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+ * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+ * preempt models) or mmu_lock contention (full or real-time models).
+ * Zapping at finer granularity marginally increases the total time of
+ * the zap, but in most cases the zap itself isn't latency sensitive.
*
- * Because zapping a SP recurses on its children, stepping down to
- * PG_LEVEL_4K in the iterator itself is unnecessary.
+ * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+ * in order to mimic the page fault path, which can replace a 1GiB page
+ * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+ * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
+ * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+ * inducing RCU stalls, without relying on a relatively rare event
+ * (zapping roots is orders of magnitude more common). Note, because
+ * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+ * in the iterator itself is unnecessary.
*/
+ if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+ }
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
@@ -800,7 +844,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
tdp_mmu_iter_set_spte(kvm, &iter, 0);
- flush = true;
+
+ /*
+ * Zappings SPTEs in invalid roots doesn't require a TLB flush,
+ * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+ */
+ if (!root->role.invalid)
+ flush = true;
}
rcu_read_unlock();
@@ -813,16 +863,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
}
/*
- * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
- * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
- * more SPTEs were zapped since the MMU lock was last acquired.
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
*/
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
@@ -896,7 +946,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* the VM is being destroyed).
*
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
- * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ * See kvm_tdp_mmu_alloc_root().
*/
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
@@ -1498,17 +1548,21 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
}
}
-/*
- * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
- * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
- * If AD bits are not enabled, this will require clearing the writable bit on
- * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
- */
+static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
+{
+ /*
+ * All TDP MMU shadow pages share the same role as their root, aside
+ * from level, so it is valid to key off any shadow page to determine if
+ * write protection is needed for an entire tree.
+ */
+ return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled();
+}
+
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
- u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
+ const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
bool spte_set = false;
@@ -1523,7 +1577,7 @@ retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
spte_ad_need_write_protect(iter.old_spte));
if (!(iter.old_spte & dbit))
@@ -1540,11 +1594,9 @@ retry:
}
/*
- * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
- * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
- * If AD bits are not enabled, this will require clearing the writable bit on
- * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
- * be flushed.
+ * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
+ * memslot. Returns true if an SPTE has been changed and the TLBs need to be
+ * flushed.
*/
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot)
@@ -1560,18 +1612,11 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
return spte_set;
}
-/*
- * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
- * set in mask, starting at gfn. The given memslot is expected to contain all
- * the GFNs represented by set bits in the mask. If AD bits are enabled,
- * clearing the dirty status will involve clearing the dirty bit on each SPTE
- * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
- */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
- u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
- shadow_dirty_mask;
+ const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -1583,7 +1628,7 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
if (!mask)
break;
- KVM_MMU_WARN_ON(kvm_ad_enabled() &&
+ KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
spte_ad_need_write_protect(iter.old_spte));
if (iter.level > PG_LEVEL_4K ||
@@ -1609,11 +1654,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
}
/*
- * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
- * set in mask, starting at gfn. The given memslot is expected to contain all
- * the GFNs represented by set bits in the mask. If AD bits are enabled,
- * clearing the dirty status will involve clearing the dirty bit on each SPTE
- * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
+ * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
+ * which a bit is set in mask, starting at gfn. The given memslot is expected to
+ * contain all the GFNs represented by set bits in the mask.
*/
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
@@ -1622,7 +1665,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
@@ -1740,7 +1783,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
return spte_set;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 20d97aa46c49..6e1ea04ca885 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,7 +10,7 @@
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 87cc6c8809ad..a593b03c9aed 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -29,6 +29,9 @@
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);
+struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
+EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
+
/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
@@ -67,7 +70,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
* all perf counters (both gp and fixed). The mapping relationship
* between pmc and perf counters is as the following:
* * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
- * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
* and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
*/
@@ -411,7 +414,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
int idx)
{
- int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+ int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
if (filter->action == KVM_PMU_EVENT_DENY &&
test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
@@ -441,11 +444,10 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
- static_call(kvm_x86_pmu_hw_event_available)(pmc) &&
check_pmu_event_filter(pmc);
}
-static void reprogram_counter(struct kvm_pmc *pmc)
+static int reprogram_counter(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 eventsel = pmc->eventsel;
@@ -456,7 +458,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
emulate_overflow = pmc_pause_counter(pmc);
if (!pmc_event_is_allowed(pmc))
- goto reprogram_complete;
+ return 0;
if (emulate_overflow)
__kvm_perf_overflow(pmc, false);
@@ -466,7 +468,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc)) {
fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
- pmc->idx - INTEL_PMC_IDX_FIXED);
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX);
if (fixed_ctr_ctrl & 0x1)
eventsel |= ARCH_PERFMON_EVENTSEL_OS;
if (fixed_ctr_ctrl & 0x2)
@@ -477,43 +479,45 @@ static void reprogram_counter(struct kvm_pmc *pmc)
}
if (pmc->current_config == new_config && pmc_resume_counter(pmc))
- goto reprogram_complete;
+ return 0;
pmc_release_perf_event(pmc);
pmc->current_config = new_config;
- /*
- * If reprogramming fails, e.g. due to contention, leave the counter's
- * regprogram bit set, i.e. opportunistically try again on the next PMU
- * refresh. Don't make a new request as doing so can stall the guest
- * if reprogramming repeatedly fails.
- */
- if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
- (eventsel & pmu->raw_event_mask),
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT))
- return;
-
-reprogram_complete:
- clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+ return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
+ DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
int bit;
- for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
- struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
+ bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
- if (unlikely(!pmc)) {
- clear_bit(bit, pmu->reprogram_pmi);
- continue;
- }
+ /*
+ * The reprogramming bitmap can be written asynchronously by something
+ * other than the task that holds vcpu->mutex, take care to clear only
+ * the bits that will actually processed.
+ */
+ BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
+ atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
- reprogram_counter(pmc);
+ kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
+ /*
+ * If reprogramming fails, e.g. due to contention, re-set the
+ * regprogram bit set, i.e. opportunistically try again on the
+ * next PMU refresh. Don't make a new request as doing so can
+ * stall the guest if reprogramming repeatedly fails.
+ */
+ if (reprogram_counter(pmc))
+ set_bit(pmc->idx, pmu->reprogram_pmi);
}
/*
@@ -525,10 +529,20 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
kvm_pmu_cleanup(vcpu);
}
-/* check if idx is a valid index to access PMU */
-bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
- return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
+ /*
+ * On Intel, VMX interception has priority over RDPMC exceptions that
+ * aren't already handled by the emulator, i.e. there are no additional
+ * check needed for Intel PMUs.
+ *
+ * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
+ * i.e. an invalid PMC results in a #GP, not #VMEXIT.
+ */
+ if (!kvm_pmu_ops.check_rdpmc_early)
+ return 0;
+
+ return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -567,10 +581,9 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
- bool fast_mode = idx & (1u << 31);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
- u64 mask = fast_mode ? ~0u : ~0ull;
+ u64 mask = ~0ull;
if (!pmu->version)
return 1;
@@ -716,11 +729,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
- for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
- pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
- if (!pmc)
- continue;
-
+ kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
pmc_stop_counter(pmc);
pmc->counter = 0;
pmc->emulated_counter = 0;
@@ -741,6 +750,8 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
return;
@@ -750,8 +761,34 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
*/
kvm_pmu_reset(vcpu);
- bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+ pmu->version = 0;
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->global_ctrl_mask = ~0ull;
+ pmu->global_status_mask = ~0ull;
+ pmu->fixed_ctr_ctrl_mask = ~0ull;
+ pmu->pebs_enable_mask = ~0ull;
+ pmu->pebs_data_cfg_mask = ~0ull;
+ bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+
+ if (!vcpu->kvm->arch.enable_pmu)
+ return;
+
static_call(kvm_x86_pmu_refresh)(vcpu);
+
+ /*
+ * At RESET, both Intel and AMD CPUs set all enable bits for general
+ * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
+ * was written for v1 PMUs don't unknowingly leave GP counters disabled
+ * in the global controls). Emulate that behavior when refreshing the
+ * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL.
+ */
+ if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
+ pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
@@ -776,10 +813,8 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
pmu->pmc_in_use, X86_PMC_IDX_MAX);
- for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
- pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
-
- if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
+ kvm_for_each_pmc(pmu, pmc, i, bitmask) {
+ if (pmc->perf_event && !pmc_speculative_in_use(pmc))
pmc_stop_counter(pmc);
}
@@ -799,13 +834,6 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
kvm_pmu_request_counter_reprogram(pmc);
}
-static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
- unsigned int perf_hw_id)
-{
- return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
- AMD64_RAW_EVENT_MASK_NB);
-}
-
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
bool select_os, select_user;
@@ -817,29 +845,56 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
select_user = config & ARCH_PERFMON_EVENTSEL_USR;
} else {
config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
- pmc->idx - INTEL_PMC_IDX_FIXED);
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX);
select_os = config & 0x1;
select_user = config & 0x2;
}
+ /*
+ * Skip the CPL lookup, which isn't free on Intel, if the result will
+ * be the same regardless of the CPL.
+ */
+ if (select_os == select_user)
+ return select_os;
+
return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}
-void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
+ DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
int i;
- for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
- pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
+ BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
- if (!pmc || !pmc_event_is_allowed(pmc))
+ if (!kvm_pmu_has_perf_global_ctrl(pmu))
+ bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+ else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
+ (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
+ return;
+
+ kvm_for_each_pmc(pmu, pmc, i, bitmap) {
+ /*
+ * Ignore checks for edge detect (all events currently emulated
+ * but KVM are always rising edges), pin control (unsupported
+ * by modern CPUs), and counter mask and its invert flag (KVM
+ * doesn't emulate multiple events in a single clock cycle).
+ *
+ * Note, the uppermost nibble of AMD's mask overlaps Intel's
+ * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
+ * bits (bits 35:34). Checking the "in HLE/RTM transaction"
+ * flags is correct as the vCPU can't be in a transaction if
+ * KVM is emulating an instruction. Checking the reserved bits
+ * might be wrong if they are defined in the future, but so
+ * could ignoring them, so do the simple thing for now.
+ */
+ if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
+ !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
continue;
- /* Ignore checks for edge detect, pin control, invert and CMASK bits */
- if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
- kvm_pmu_incr_counter(pmc);
+ kvm_pmu_incr_counter(pmc);
}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7caeb3d8d4fd..4d52b0b539ba 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -4,6 +4,8 @@
#include <linux/nospec.h>
+#include <asm/kvm_host.h>
+
#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
@@ -18,13 +20,18 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+#define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED
+
+struct kvm_pmu_emulated_event_selectors {
+ u64 INSTRUCTIONS_RETIRED;
+ u64 BRANCH_INSTRUCTIONS_RETIRED;
+};
+
struct kvm_pmu_ops {
- bool (*hw_event_available)(struct kvm_pmc *pmc);
- struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask);
struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
- bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
+ int (*check_rdpmc_early)(struct kvm_vcpu *vcpu, unsigned int idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -55,6 +62,38 @@ static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
return pmu->version > 1;
}
+/*
+ * KVM tracks all counters in 64-bit bitmaps, with general purpose counters
+ * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0
+ * is tracked internally via index 32. On Intel, (AMD doesn't support fixed
+ * counters), this mirrors how fixed counters are mapped to PERF_GLOBAL_CTRL
+ * and similar MSRs, i.e. tracking fixed counters at base index 32 reduces the
+ * amounter of boilerplate needed to iterate over PMCs *and* simplifies common
+ * enabling/disable/reset operations.
+ *
+ * WARNING! This helper is only for lookups that are initiated by KVM, it is
+ * NOT safe for guest lookups, e.g. will do the wrong thing if passed a raw
+ * ECX value from RDPMC (fixed counters are accessed by setting bit 30 in ECX
+ * for RDPMC, not by adding 32 to the fixed counter index).
+ */
+static inline struct kvm_pmc *kvm_pmc_idx_to_pmc(struct kvm_pmu *pmu, int idx)
+{
+ if (idx < pmu->nr_arch_gp_counters)
+ return &pmu->gp_counters[idx];
+
+ idx -= KVM_FIXED_PMC_BASE_IDX;
+ if (idx >= 0 && idx < pmu->nr_arch_fixed_counters)
+ return &pmu->fixed_counters[idx];
+
+ return NULL;
+}
+
+#define kvm_for_each_pmc(pmu, pmc, i, bitmap) \
+ for_each_set_bit(i, bitmap, X86_PMC_IDX_MAX) \
+ if (!(pmc = kvm_pmc_idx_to_pmc(pmu, i))) \
+ continue; \
+ else \
+
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -131,12 +170,13 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc))
return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
- pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3;
return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}
extern struct x86_pmu_capability kvm_pmu_cap;
+extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel;
static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
{
@@ -178,6 +218,11 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
pmu_ops->MAX_NR_GP_COUNTERS);
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
KVM_PMC_MAX_FIXED);
+
+ kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
+ perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED =
+ perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
}
static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
@@ -216,7 +261,7 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
-bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -225,7 +270,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
-void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id);
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel);
bool is_vmware_backdoor_pmc(u32 pmc_idx);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index aadefcaa9561..2f4e155080ba 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -52,7 +52,7 @@ enum kvm_only_cpuid_leafs {
#define X86_FEATURE_IPRED_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 1)
#define KVM_X86_FEATURE_RRSBA_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 2)
#define X86_FEATURE_DDPD_U KVM_X86_FEATURE(CPUID_7_2_EDX, 3)
-#define X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
+#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
/* CPUID level 0x80000007 (EDX). */
@@ -102,10 +102,12 @@ static const struct cpuid_reg reverse_cpuid[] = {
*/
static __always_inline void reverse_cpuid_check(unsigned int x86_leaf)
{
+ BUILD_BUG_ON(NR_CPUID_WORDS != NCAPINTS);
BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_5);
BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
}
@@ -126,6 +128,7 @@ static __always_inline u32 __feature_translate(int x86_feature)
KVM_X86_TRANSLATE_FEATURE(CONSTANT_TSC);
KVM_X86_TRANSLATE_FEATURE(PERFMON_V2);
KVM_X86_TRANSLATE_FEATURE(RRSBA_CTRL);
+ KVM_X86_TRANSLATE_FEATURE(BHI_CTRL);
default:
return x86_feature;
}
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index dc3d95fdca7d..d06d43d8d2aa 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -184,7 +184,6 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
struct kvm_smram_state_32 *smram)
{
struct desc_ptr dt;
- unsigned long val;
int i;
smram->cr0 = kvm_read_cr0(vcpu);
@@ -195,10 +194,8 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
for (i = 0; i < 8; i++)
smram->gprs[i] = kvm_register_read_raw(vcpu, i);
- kvm_get_dr(vcpu, 6, &val);
- smram->dr6 = (u32)val;
- kvm_get_dr(vcpu, 7, &val);
- smram->dr7 = (u32)val;
+ smram->dr6 = (u32)vcpu->arch.dr6;
+ smram->dr7 = (u32)vcpu->arch.dr7;
enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
@@ -231,7 +228,6 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
struct kvm_smram_state_64 *smram)
{
struct desc_ptr dt;
- unsigned long val;
int i;
for (i = 0; i < 16; i++)
@@ -240,11 +236,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
smram->rip = kvm_rip_read(vcpu);
smram->rflags = kvm_get_rflags(vcpu);
-
- kvm_get_dr(vcpu, 6, &val);
- smram->dr6 = val;
- kvm_get_dr(vcpu, 7, &val);
- smram->dr7 = val;
+ smram->dr6 = vcpu->arch.dr6;
+ smram->dr7 = vcpu->arch.dr7;
smram->cr0 = kvm_read_cr0(vcpu);
smram->cr3 = kvm_read_cr3(vcpu);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index b6a7ad4d6914..dfcc38bd97d3 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -25,7 +25,7 @@ enum pmu_type {
PMU_TYPE_EVNTSEL,
};
-static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+static struct kvm_pmc *amd_pmu_get_pmc(struct kvm_pmu *pmu, int pmc_idx)
{
unsigned int num_counters = pmu->nr_arch_gp_counters;
@@ -70,28 +70,24 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
return NULL;
}
- return amd_pmc_idx_to_pmc(pmu, idx);
+ return amd_pmu_get_pmc(pmu, idx);
}
-static bool amd_hw_event_available(struct kvm_pmc *pmc)
-{
- return true;
-}
-
-static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+static int amd_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- idx &= ~(3u << 30);
+ if (idx >= pmu->nr_arch_gp_counters)
+ return -EINVAL;
- return idx < pmu->nr_arch_gp_counters;
+ return 0;
}
/* idx is the ECX register of RDPMC instruction */
static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask)
{
- return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30));
+ return amd_pmu_get_pmc(vcpu_to_pmu(vcpu), idx);
}
static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
@@ -233,11 +229,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
}
struct kvm_pmu_ops amd_pmu_ops __initdata = {
- .hw_event_available = amd_hw_event_available,
- .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = amd_msr_idx_to_pmc,
- .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx,
+ .check_rdpmc_early = amd_check_rdpmc_early,
.is_valid_msr = amd_is_valid_msr,
.get_msr = amd_pmu_get_msr,
.set_msr = amd_pmu_set_msr,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index ae0ac12382b9..759581bb2128 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -84,9 +84,10 @@ struct enc_region {
};
/* Called with the sev_bitmap_lock held, or on shutdown */
-static int sev_flush_asids(int min_asid, int max_asid)
+static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
{
- int ret, asid, error = 0;
+ int ret, error = 0;
+ unsigned int asid;
/* Check if there are any ASIDs to reclaim before performing a flush */
asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid);
@@ -116,7 +117,7 @@ static inline bool is_mirroring_enc_context(struct kvm *kvm)
}
/* Must be called with the sev_bitmap_lock held */
-static bool __sev_recycle_asids(int min_asid, int max_asid)
+static bool __sev_recycle_asids(unsigned int min_asid, unsigned int max_asid)
{
if (sev_flush_asids(min_asid, max_asid))
return false;
@@ -143,8 +144,20 @@ static void sev_misc_cg_uncharge(struct kvm_sev_info *sev)
static int sev_asid_new(struct kvm_sev_info *sev)
{
- int asid, min_asid, max_asid, ret;
+ /*
+ * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
+ * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
+ * Note: min ASID can end up larger than the max if basic SEV support is
+ * effectively disabled by disallowing use of ASIDs for SEV guests.
+ */
+ unsigned int min_asid = sev->es_active ? 1 : min_sev_asid;
+ unsigned int max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
+ unsigned int asid;
bool retry = true;
+ int ret;
+
+ if (min_asid > max_asid)
+ return -ENOTTY;
WARN_ON(sev->misc_cg);
sev->misc_cg = get_current_misc_cg();
@@ -157,12 +170,6 @@ static int sev_asid_new(struct kvm_sev_info *sev)
mutex_lock(&sev_bitmap_lock);
- /*
- * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid.
- * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1.
- */
- min_asid = sev->es_active ? 1 : min_sev_asid;
- max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid;
again:
asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid);
if (asid > max_asid) {
@@ -179,7 +186,8 @@ again:
mutex_unlock(&sev_bitmap_lock);
- return asid;
+ sev->asid = asid;
+ return 0;
e_uncharge:
sev_misc_cg_uncharge(sev);
put_misc_cg(sev->misc_cg);
@@ -187,7 +195,7 @@ e_uncharge:
return ret;
}
-static int sev_get_asid(struct kvm *kvm)
+static unsigned int sev_get_asid(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -247,21 +255,19 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_platform_init_args init_args = {0};
- int asid, ret;
+ int ret;
if (kvm->created_vcpus)
return -EINVAL;
- ret = -EBUSY;
if (unlikely(sev->active))
- return ret;
+ return -EINVAL;
sev->active = true;
sev->es_active = argp->id == KVM_SEV_ES_INIT;
- asid = sev_asid_new(sev);
- if (asid < 0)
+ ret = sev_asid_new(sev);
+ if (ret)
goto e_no_asid;
- sev->asid = asid;
init_args.probe = false;
ret = sev_platform_init(&init_args);
@@ -287,8 +293,8 @@ e_no_asid:
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
+ unsigned int asid = sev_get_asid(kvm);
struct sev_data_activate activate;
- int asid = sev_get_asid(kvm);
int ret;
/* activate ASID on the given handle */
@@ -428,7 +434,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
- pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT);
else
pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
@@ -2240,8 +2246,10 @@ void __init sev_hardware_setup(void)
goto out;
}
- sev_asid_count = max_sev_asid - min_sev_asid + 1;
- WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
+ if (min_sev_asid <= max_sev_asid) {
+ sev_asid_count = max_sev_asid - min_sev_asid + 1;
+ WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count));
+ }
sev_supported = true;
/* SEV-ES support requested? */
@@ -2272,7 +2280,9 @@ void __init sev_hardware_setup(void)
out:
if (boot_cpu_has(X86_FEATURE_SEV))
pr_info("SEV %s (ASIDs %u - %u)\n",
- sev_supported ? "enabled" : "disabled",
+ sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
+ "unusable" :
+ "disabled",
min_sev_asid, max_sev_asid);
if (boot_cpu_has(X86_FEATURE_SEV_ES))
pr_info("SEV-ES %s (ASIDs %u - %u)\n",
@@ -2320,7 +2330,7 @@ int sev_cpu_init(struct svm_cpu_data *sd)
*/
static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{
- int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
+ unsigned int asid = sev_get_asid(vcpu->kvm);
/*
* Note! The address must be a kernel address, as regular page walk
@@ -2638,7 +2648,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
void pre_sev_run(struct vcpu_svm *svm, int cpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
- int asid = sev_get_asid(svm->vcpu.kvm);
+ unsigned int asid = sev_get_asid(svm->vcpu.kvm);
/* Assign the asid allocated with this SEV guest */
svm->asid = asid;
@@ -3174,7 +3184,7 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
unsigned long pfn;
struct page *p;
- if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
/*
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 272d5ed37ce7..9aaf83c8d57d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1503,6 +1503,11 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
+static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
+{
+ return page_address(sd->save_area) + 0x400;
+}
+
static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1519,12 +1524,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
* or subsequent vmload of host save area.
*/
vmsave(sd->save_area_pa);
- if (sev_es_guest(vcpu->kvm)) {
- struct sev_es_save_area *hostsa;
- hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
-
- sev_es_prepare_switch_to_guest(hostsa);
- }
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_prepare_switch_to_guest(sev_es_host_save_area(sd));
if (tsc_scaling)
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
@@ -2735,7 +2736,6 @@ static int dr_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
int reg, dr;
- unsigned long val;
int err = 0;
/*
@@ -2763,11 +2763,9 @@ static int dr_interception(struct kvm_vcpu *vcpu)
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
if (dr >= 16) { /* mov to DRn */
dr -= 16;
- val = kvm_register_read(vcpu, reg);
- err = kvm_set_dr(vcpu, dr, val);
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
} else {
- kvm_get_dr(vcpu, dr, &val);
- kvm_register_write(vcpu, reg, val);
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
}
return kvm_complete_insn_gp(vcpu, err);
@@ -4092,6 +4090,9 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);
@@ -4101,6 +4102,7 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted)
{
+ struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu);
struct vcpu_svm *svm = to_svm(vcpu);
guest_state_enter_irqoff();
@@ -4108,19 +4110,21 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
amd_clear_divider();
if (sev_es_guest(vcpu->kvm))
- __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted);
+ __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
+ sev_es_host_save_area(sd));
else
__svm_vcpu_run(svm, spec_ctrl_intercepted);
guest_state_exit_irqoff();
}
-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -4139,9 +4143,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* is enough to force an immediate vmexit.
*/
disable_nmi_singlestep(svm);
- smp_send_reschedule(vcpu->cpu);
+ force_immediate_exit = true;
}
+ if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
+
pre_svm_run(vcpu);
sync_lapic_to_cr8(vcpu);
@@ -4237,9 +4244,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
return svm_exit_handlers_fastpath(vcpu);
}
@@ -5007,8 +5011,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
- .request_immediate_exit = __kvm_request_immediate_exit,
-
.sched_in = svm_sched_in,
.nested_ops = &svm_nested_ops,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 7f1fbd874c45..33878efdebc8 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -698,7 +698,8 @@ struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
/* vmenter.S */
-void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
+void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted,
+ struct sev_es_save_area *hostsa);
void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted);
#define DEFINE_KVM_GHCB_ACCESSORS(field) \
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 187018c424bf..a0c8eb37d3e1 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -3,6 +3,7 @@
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/bitsperlong.h>
+#include <asm/frame.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
#include "kvm-asm-offsets.h"
@@ -67,7 +68,7 @@
"", X86_FEATURE_V_SPEC_CTRL
901:
.endm
-.macro RESTORE_HOST_SPEC_CTRL_BODY
+.macro RESTORE_HOST_SPEC_CTRL_BODY spec_ctrl_intercepted:req
900:
/* Same for after vmexit. */
mov $MSR_IA32_SPEC_CTRL, %ecx
@@ -76,7 +77,7 @@
* Load the value that the guest had written into MSR_IA32_SPEC_CTRL,
* if it was not intercepted during guest execution.
*/
- cmpb $0, (%_ASM_SP)
+ cmpb $0, \spec_ctrl_intercepted
jnz 998f
rdmsr
movl %eax, SVM_spec_ctrl(%_ASM_DI)
@@ -99,6 +100,7 @@
*/
SYM_FUNC_START(__svm_vcpu_run)
push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
#ifdef CONFIG_X86_64
push %r15
push %r14
@@ -268,7 +270,7 @@ SYM_FUNC_START(__svm_vcpu_run)
RET
RESTORE_GUEST_SPEC_CTRL_BODY
- RESTORE_HOST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY (%_ASM_SP)
10: cmpb $0, _ASM_RIP(kvm_rebooting)
jne 2b
@@ -290,66 +292,68 @@ SYM_FUNC_START(__svm_vcpu_run)
SYM_FUNC_END(__svm_vcpu_run)
+#ifdef CONFIG_KVM_AMD_SEV
+
+
+#ifdef CONFIG_X86_64
+#define SEV_ES_GPRS_BASE 0x300
+#define SEV_ES_RBX (SEV_ES_GPRS_BASE + __VCPU_REGS_RBX * WORD_SIZE)
+#define SEV_ES_RBP (SEV_ES_GPRS_BASE + __VCPU_REGS_RBP * WORD_SIZE)
+#define SEV_ES_RSI (SEV_ES_GPRS_BASE + __VCPU_REGS_RSI * WORD_SIZE)
+#define SEV_ES_RDI (SEV_ES_GPRS_BASE + __VCPU_REGS_RDI * WORD_SIZE)
+#define SEV_ES_R12 (SEV_ES_GPRS_BASE + __VCPU_REGS_R12 * WORD_SIZE)
+#define SEV_ES_R13 (SEV_ES_GPRS_BASE + __VCPU_REGS_R13 * WORD_SIZE)
+#define SEV_ES_R14 (SEV_ES_GPRS_BASE + __VCPU_REGS_R14 * WORD_SIZE)
+#define SEV_ES_R15 (SEV_ES_GPRS_BASE + __VCPU_REGS_R15 * WORD_SIZE)
+#endif
+
/**
* __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode
* @svm: struct vcpu_svm *
* @spec_ctrl_intercepted: bool
*/
SYM_FUNC_START(__svm_sev_es_vcpu_run)
- push %_ASM_BP
-#ifdef CONFIG_X86_64
- push %r15
- push %r14
- push %r13
- push %r12
-#else
- push %edi
- push %esi
-#endif
- push %_ASM_BX
+ FRAME_BEGIN
/*
- * Save variables needed after vmexit on the stack, in inverse
- * order compared to when they are needed.
+ * Save non-volatile (callee-saved) registers to the host save area.
+ * Except for RAX and RSP, all GPRs are restored on #VMEXIT, but not
+ * saved on VMRUN.
*/
+ mov %rbp, SEV_ES_RBP (%rdx)
+ mov %r15, SEV_ES_R15 (%rdx)
+ mov %r14, SEV_ES_R14 (%rdx)
+ mov %r13, SEV_ES_R13 (%rdx)
+ mov %r12, SEV_ES_R12 (%rdx)
+ mov %rbx, SEV_ES_RBX (%rdx)
- /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */
- push %_ASM_ARG2
-
- /* Save @svm. */
- push %_ASM_ARG1
-
-.ifnc _ASM_ARG1, _ASM_DI
/*
- * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX
- * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL.
+ * Save volatile registers that hold arguments that are needed after
+ * #VMEXIT (RDI=@svm and RSI=@spec_ctrl_intercepted).
*/
- mov %_ASM_ARG1, %_ASM_DI
-.endif
+ mov %rdi, SEV_ES_RDI (%rdx)
+ mov %rsi, SEV_ES_RSI (%rdx)
- /* Clobbers RAX, RCX, RDX. */
+ /* Clobbers RAX, RCX, RDX (@hostsa). */
RESTORE_GUEST_SPEC_CTRL
/* Get svm->current_vmcb->pa into RAX. */
- mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX
- mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX
+ mov SVM_current_vmcb(%rdi), %rax
+ mov KVM_VMCB_pa(%rax), %rax
/* Enter guest mode */
sti
-1: vmrun %_ASM_AX
+1: vmrun %rax
2: cli
- /* Pop @svm to RDI, guest registers have been saved already. */
- pop %_ASM_DI
-
#ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
- FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
- /* Clobbers RAX, RCX, RDX. */
+ /* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */
RESTORE_HOST_SPEC_CTRL
/*
@@ -361,30 +365,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
*/
UNTRAIN_RET_VM
- /* "Pop" @spec_ctrl_intercepted. */
- pop %_ASM_BX
-
- pop %_ASM_BX
-
-#ifdef CONFIG_X86_64
- pop %r12
- pop %r13
- pop %r14
- pop %r15
-#else
- pop %esi
- pop %edi
-#endif
- pop %_ASM_BP
+ FRAME_END
RET
RESTORE_GUEST_SPEC_CTRL_BODY
- RESTORE_HOST_SPEC_CTRL_BODY
+ RESTORE_HOST_SPEC_CTRL_BODY %sil
-3: cmpb $0, _ASM_RIP(kvm_rebooting)
+3: cmpb $0, kvm_rebooting(%rip)
jne 2b
ud2
_ASM_EXTABLE(1b, 3b)
SYM_FUNC_END(__svm_sev_es_vcpu_run)
+#endif /* CONFIG_KVM_AMD_SEV */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 83843379813e..c6b4b1728006 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -15,20 +15,23 @@
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_entry,
- TP_PROTO(struct kvm_vcpu *vcpu),
- TP_ARGS(vcpu),
+ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
+ TP_ARGS(vcpu, force_immediate_exit),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned long, rip )
+ __field( bool, immediate_exit )
),
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
__entry->rip = kvm_rip_read(vcpu);
+ __entry->immediate_exit = force_immediate_exit;
),
- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
+ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
+ __entry->immediate_exit ? "[immediate exit]" : "")
);
/*
@@ -732,13 +735,13 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
* Tracepoint for nested #vmexit because of interrupt pending
*/
TRACE_EVENT(kvm_invlpga,
- TP_PROTO(__u64 rip, int asid, u64 address),
+ TP_PROTO(__u64 rip, unsigned int asid, u64 address),
TP_ARGS(rip, asid, address),
TP_STRUCT__entry(
- __field( __u64, rip )
- __field( int, asid )
- __field( __u64, address )
+ __field( __u64, rip )
+ __field( unsigned int, asid )
+ __field( __u64, address )
),
TP_fast_assign(
@@ -747,7 +750,7 @@ TRACE_EVENT(kvm_invlpga,
__entry->address = address;
),
- TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
+ TP_printk("rip: 0x%016llx asid: %u address: 0x%016llx",
__entry->rip, __entry->asid, __entry->address)
);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6329a306856b..d05ddf751491 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3606,7 +3606,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
return nested_vmx_failInvalid(vcpu);
@@ -4433,7 +4433,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
- kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_dr7 = vcpu->arch.dr7;
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
vmcs12->guest_ia32_efer = vcpu->arch.efer;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 315c7c2ba89b..be40474de6e4 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -20,53 +20,19 @@
#include "nested.h"
#include "pmu.h"
-#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
-
-enum intel_pmu_architectural_events {
- /*
- * The order of the architectural events matters as support for each
- * event is enumerated via CPUID using the index of the event.
- */
- INTEL_ARCH_CPU_CYCLES,
- INTEL_ARCH_INSTRUCTIONS_RETIRED,
- INTEL_ARCH_REFERENCE_CYCLES,
- INTEL_ARCH_LLC_REFERENCES,
- INTEL_ARCH_LLC_MISSES,
- INTEL_ARCH_BRANCHES_RETIRED,
- INTEL_ARCH_BRANCHES_MISPREDICTED,
-
- NR_REAL_INTEL_ARCH_EVENTS,
-
- /*
- * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a.
- * TSC reference cycles. The architectural reference cycles event may
- * or may not actually use the TSC as the reference, e.g. might use the
- * core crystal clock or the bus clock (yeah, "architectural").
- */
- PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS,
- NR_INTEL_ARCH_EVENTS,
-};
+/*
+ * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX
+ * to encode the "type" of counter to read, i.e. this is not a "base". And to
+ * further confuse things, non-architectural PMUs use bit 31 as a flag for
+ * "fast" reads, whereas the "type" is an explicit value.
+ */
+#define INTEL_RDPMC_GP 0
+#define INTEL_RDPMC_FIXED INTEL_PMC_FIXED_RDPMC_BASE
-static struct {
- u8 eventsel;
- u8 unit_mask;
-} const intel_arch_events[] = {
- [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 },
- [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 },
- [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 },
- [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f },
- [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 },
- [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 },
- [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 },
- [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 },
-};
+#define INTEL_RDPMC_TYPE_MASK GENMASK(31, 16)
+#define INTEL_RDPMC_INDEX_MASK GENMASK(15, 0)
-/* mapping between fixed pmc index and intel_arch_events array */
-static int fixed_pmc_events[] = {
- [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED,
- [1] = INTEL_ARCH_CPU_CYCLES,
- [2] = PSEUDO_ARCH_REFERENCE_CYCLES,
-};
+#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
@@ -84,77 +50,61 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
- __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
+ __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use);
kvm_pmu_request_counter_reprogram(pmc);
}
}
-static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
-{
- if (pmc_idx < INTEL_PMC_IDX_FIXED) {
- return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
- MSR_P6_EVNTSEL0);
- } else {
- u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
-
- return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
- }
-}
-
-static bool intel_hw_event_available(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
- u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
- int i;
-
- BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS);
-
- /*
- * Disallow events reported as unavailable in guest CPUID. Note, this
- * doesn't apply to pseudo-architectural events.
- */
- for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) {
- if (intel_arch_events[i].eventsel != event_select ||
- intel_arch_events[i].unit_mask != unit_mask)
- continue;
-
- return pmu->available_event_types & BIT(i);
- }
-
- return true;
-}
-
-static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
-{
- struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- bool fixed = idx & (1u << 30);
-
- idx &= ~(3u << 30);
-
- return fixed ? idx < pmu->nr_arch_fixed_counters
- : idx < pmu->nr_arch_gp_counters;
-}
-
static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask)
{
+ unsigned int type = idx & INTEL_RDPMC_TYPE_MASK;
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- bool fixed = idx & (1u << 30);
struct kvm_pmc *counters;
unsigned int num_counters;
+ u64 bitmask;
- idx &= ~(3u << 30);
- if (fixed) {
+ /*
+ * The encoding of ECX for RDPMC is different for architectural versus
+ * non-architecturals PMUs (PMUs with version '0'). For architectural
+ * PMUs, bits 31:16 specify the PMC type and bits 15:0 specify the PMC
+ * index. For non-architectural PMUs, bit 31 is a "fast" flag, and
+ * bits 30:0 specify the PMC index.
+ *
+ * Yell and reject attempts to read PMCs for a non-architectural PMU,
+ * as KVM doesn't support such PMUs.
+ */
+ if (WARN_ON_ONCE(!pmu->version))
+ return NULL;
+
+ /*
+ * General Purpose (GP) PMCs are supported on all PMUs, and fixed PMCs
+ * are supported on all architectural PMUs, i.e. on all virtual PMUs
+ * supported by KVM. Note, KVM only emulates fixed PMCs for PMU v2+,
+ * but the type itself is still valid, i.e. let RDPMC fail due to
+ * accessing a non-existent counter. Reject attempts to read all other
+ * types, which are unknown/unsupported.
+ */
+ switch (type) {
+ case INTEL_RDPMC_FIXED:
counters = pmu->fixed_counters;
num_counters = pmu->nr_arch_fixed_counters;
- } else {
+ bitmask = pmu->counter_bitmask[KVM_PMC_FIXED];
+ break;
+ case INTEL_RDPMC_GP:
counters = pmu->gp_counters;
num_counters = pmu->nr_arch_gp_counters;
+ bitmask = pmu->counter_bitmask[KVM_PMC_GP];
+ break;
+ default:
+ return NULL;
}
+
+ idx &= INTEL_RDPMC_INDEX_MASK;
if (idx >= num_counters)
return NULL;
- *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
+
+ *mask &= bitmask;
return &counters[array_index_nospec(idx, num_counters)];
}
@@ -464,20 +414,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
+/*
+ * Map fixed counter events to architectural general purpose event encodings.
+ * Perf doesn't provide APIs to allow KVM to directly program a fixed counter,
+ * and so KVM instead programs the architectural event to effectively request
+ * the fixed counter. Perf isn't guaranteed to use a fixed counter and may
+ * instead program the encoding into a general purpose counter, e.g. if a
+ * different perf_event is already utilizing the requested counter, but the end
+ * result is the same (ignoring the fact that using a general purpose counter
+ * will likely exacerbate counter contention).
+ *
+ * Forcibly inlined to allow asserting on @index at build time, and there should
+ * never be more than one user.
+ */
+static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
{
- int i;
-
- BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED);
+ const enum perf_hw_id fixed_pmc_perf_ids[] = {
+ [0] = PERF_COUNT_HW_INSTRUCTIONS,
+ [1] = PERF_COUNT_HW_CPU_CYCLES,
+ [2] = PERF_COUNT_HW_REF_CPU_CYCLES,
+ };
+ u64 eventsel;
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- int index = array_index_nospec(i, KVM_PMC_MAX_FIXED);
- struct kvm_pmc *pmc = &pmu->fixed_counters[index];
- u32 event = fixed_pmc_events[index];
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED);
+ BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED);
- pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
- intel_arch_events[event].eventsel;
- }
+ /*
+ * Yell if perf reports support for a fixed counter but perf doesn't
+ * have a known encoding for the associated general purpose event.
+ */
+ eventsel = perf_get_hw_event_config(fixed_pmc_perf_ids[index]);
+ WARN_ON_ONCE(!eventsel && index < kvm_pmu_cap.num_counters_fixed);
+ return eventsel;
}
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
@@ -491,19 +459,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
u64 counter_mask;
int i;
- pmu->nr_arch_gp_counters = 0;
- pmu->nr_arch_fixed_counters = 0;
- pmu->counter_bitmask[KVM_PMC_GP] = 0;
- pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
- pmu->version = 0;
- pmu->reserved_bits = 0xffffffff00200000ull;
- pmu->raw_event_mask = X86_RAW_EVENT_MASK;
- pmu->global_ctrl_mask = ~0ull;
- pmu->global_status_mask = ~0ull;
- pmu->fixed_ctr_ctrl_mask = ~0ull;
- pmu->pebs_enable_mask = ~0ull;
- pmu->pebs_data_cfg_mask = ~0ull;
-
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
/*
@@ -515,8 +470,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
entry = kvm_find_cpuid_entry(vcpu, 0xa);
- if (!entry || !vcpu->kvm->arch.enable_pmu)
+ if (!entry)
return;
+
eax.full = entry->eax;
edx.full = entry->edx;
@@ -543,13 +499,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
kvm_pmu_cap.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
- setup_fixed_pmc_eventsel(pmu);
}
for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX));
pmu->global_ctrl_mask = counter_mask;
/*
@@ -580,7 +535,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
perf_capabilities = vcpu_get_perf_capabilities(vcpu);
if (cpuid_model_is_consistent(vcpu) &&
(perf_capabilities & PMU_CAP_LBR_FMT))
- x86_perf_get_lbr(&lbr_desc->records);
+ memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps));
else
lbr_desc->records.nr = 0;
@@ -593,7 +548,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
pmu->fixed_ctr_ctrl_mask &=
- ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
+ ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4));
}
pmu->pebs_data_cfg_mask = ~0xff00000full;
} else {
@@ -619,8 +574,9 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
- pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX;
pmu->fixed_counters[i].current_config = 0;
+ pmu->fixed_counters[i].eventsel = intel_get_fixed_pmc_eventsel(i);
}
lbr_desc->records.nr = 0;
@@ -748,11 +704,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
struct kvm_pmc *pmc = NULL;
int bit, hw_idx;
- for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl,
- X86_PMC_IDX_MAX) {
- pmc = intel_pmc_idx_to_pmc(pmu, bit);
-
- if (!pmc || !pmc_speculative_in_use(pmc) ||
+ kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) {
+ if (!pmc_speculative_in_use(pmc) ||
!pmc_is_globally_enabled(pmc) || !pmc->perf_event)
continue;
@@ -767,11 +720,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
}
struct kvm_pmu_ops intel_pmu_ops __initdata = {
- .hw_event_available = intel_hw_event_available,
- .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = intel_msr_idx_to_pmc,
- .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx,
.is_valid_msr = intel_is_valid_msr,
.get_msr = intel_pmu_get_msr,
.set_msr = intel_pmu_set_msr,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index af662312fd07..ec08fa3caf43 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* handle task migration (@cpu != vcpu->cpu).
*/
new.ndst = dest;
- new.sn = 0;
+ __pi_clear_sn(&new);
/*
* Restore the notification vector; in the blocking case, the
@@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
+ WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking");
old.control = READ_ONCE(pi_desc->control);
do {
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 26992076552e..6b2a0226257e 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -1,98 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H
-
-#define POSTED_INTR_ON 0
-#define POSTED_INTR_SN 1
-
-#define PID_TABLE_ENTRY_VALID 1
-
-/* Posted-Interrupt Descriptor */
-struct pi_desc {
- u32 pir[8]; /* Posted interrupt requested */
- union {
- struct {
- /* bit 256 - Outstanding Notification */
- u16 on : 1,
- /* bit 257 - Suppress Notification */
- sn : 1,
- /* bit 271:258 - Reserved */
- rsvd_1 : 14;
- /* bit 279:272 - Notification Vector */
- u8 nv;
- /* bit 287:280 - Reserved */
- u8 rsvd_2;
- /* bit 319:288 - Notification Destination */
- u32 ndst;
- };
- u64 control;
- };
- u32 rsvd[6];
-} __aligned(64);
-
-static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
-{
- return test_and_set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
-{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
-}
-
-static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
-{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
-}
-
-static inline void pi_set_sn(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_set_on(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_on(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_on(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_sn(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
+#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 2bfbf758d061..f6986dee6f8c 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -275,6 +275,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
call vmx_spec_ctrl_restore_host
+ CLEAR_BRANCH_HISTORY_VMEXIT
+
/* Put return value in AX */
mov %_ASM_BX, %_ASM_AX
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 305237dcba88..becefaf95cab 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -50,6 +50,8 @@
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>
+#include <trace/events/ipi.h>
+
#include "capabilities.h"
#include "cpuid.h"
#include "hyperv.h"
@@ -68,6 +70,7 @@
#include "x86.h"
#include "smm.h"
#include "vmx_onhyperv.h"
+#include "posted_intr.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -160,7 +163,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
/*
* List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic and PT MSRs are handled specially.
+ * In addition to these x2apic, PT and LBR MSRs are handled specially.
*/
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_SPEC_CTRL,
@@ -216,6 +219,8 @@ module_param(ple_window_max, uint, 0444);
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);
+struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
+
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
@@ -668,25 +673,14 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static int possible_passthrough_msr_slot(u32 msr)
+static int vmx_get_passthrough_msr_slot(u32 msr)
{
- u32 i;
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
-
- return -ENOENT;
-}
-
-static bool is_valid_passthrough_msr(u32 msr)
-{
- bool r;
+ int i;
switch (msr) {
case 0x800 ... 0x8ff:
/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return true;
+ return -ENOENT;
case MSR_IA32_RTIT_STATUS:
case MSR_IA32_RTIT_OUTPUT_BASE:
case MSR_IA32_RTIT_OUTPUT_MASK:
@@ -701,14 +695,16 @@ static bool is_valid_passthrough_msr(u32 msr)
case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return true;
+ return -ENOENT;
}
- r = possible_passthrough_msr_slot(msr) != -ENOENT;
-
- WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+ if (vmx_possible_passthrough_msrs[i] == msr)
+ return i;
+ }
- return r;
+ WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+ return -ENOENT;
}
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
@@ -1291,8 +1287,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
- vmx->req_immediate_exit = false;
-
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
@@ -3964,6 +3958,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -3973,16 +3968,13 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
/*
* Mark the desired intercept state in shadow bitmap, this is needed
* for resync when the MSR filters change.
- */
- if (is_valid_passthrough_msr(msr)) {
- int idx = possible_passthrough_msr_slot(msr);
-
- if (idx != -ENOENT) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
+ */
+ idx = vmx_get_passthrough_msr_slot(msr);
+ if (idx >= 0) {
+ if (type & MSR_TYPE_R)
+ clear_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ clear_bit(idx, vmx->shadow_msr_intercept.write);
}
if ((type & MSR_TYPE_R) &&
@@ -4008,6 +4000,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -4017,16 +4010,13 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
/*
* Mark the desired intercept state in shadow bitmap, this is needed
* for resync when the MSR filter changes.
- */
- if (is_valid_passthrough_msr(msr)) {
- int idx = possible_passthrough_msr_slot(msr);
-
- if (idx != -ENOENT) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
- }
+ */
+ idx = vmx_get_passthrough_msr_slot(msr);
+ if (idx >= 0) {
+ if (type & MSR_TYPE_R)
+ set_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ set_bit(idx, vmx->shadow_msr_intercept.write);
}
if (type & MSR_TYPE_R)
@@ -4137,6 +4127,9 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 i;
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
/*
* Redo intercept permissions for MSRs that KVM is passing through to
* the guest. Disabling interception will check the new MSR filter and
@@ -4852,7 +4845,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* or POSTED_INTR_WAKEUP_VECTOR.
*/
vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
+ __pi_set_sn(&vmx->pi_desc);
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -5576,10 +5569,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- unsigned long val;
-
- kvm_get_dr(vcpu, dr, &val);
- kvm_register_write(vcpu, reg, val);
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
err = 0;
} else {
err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
@@ -6001,22 +5991,46 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->req_immediate_exit &&
- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
- kvm_lapic_expired_hv_timer(vcpu);
+ /*
+ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
+ * due to the timer expiring while it was "soft" disabled, just eat the
+ * exit and re-enter the guest.
+ */
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
return EXIT_FASTPATH_REENTER_GUEST;
- }
- return EXIT_FASTPATH_NONE;
+ /*
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+ * If L2 is active, go down the slow path as emulating the guest timer
+ * expiration likely requires synthesizing a nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
}
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- handle_fastpath_preemption_timer(vcpu);
+ /*
+ * This non-fastpath handler is reached if and only if the preemption
+ * timer was being used to emulate a guest timer while L2 is active.
+ * All other scenarios are supposed to be handled in the fastpath.
+ */
+ WARN_ON_ONCE(!is_guest_mode(vcpu));
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
@@ -6519,7 +6533,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
vcpu->run->internal.data[0] = vectoring_info;
vcpu->run->internal.data[1] = exit_reason.full;
- vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
+ vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu);
if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
vcpu->run->internal.data[ndata++] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@ -7158,13 +7172,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->req_immediate_exit) {
+ if (force_immediate_exit) {
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
} else if (vmx->hv_deadline_tsc != -1) {
@@ -7217,13 +7231,22 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
barrier_nospec();
}
-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
+ /*
+ * If L2 is active, some VMX preemption timer exits can be handled in
+ * the fastpath even, all other exits must use the slow path.
+ */
+ if (is_guest_mode(vcpu) &&
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
switch (to_vmx(vcpu)->exit_reason.basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
- return handle_fastpath_preemption_timer(vcpu);
+ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
default:
return EXIT_FASTPATH_NONE;
}
@@ -7286,7 +7309,7 @@ out:
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7313,7 +7336,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
@@ -7372,7 +7395,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
- vmx_update_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu, force_immediate_exit);
+ else if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
kvm_wait_lapic_expire(vcpu);
@@ -7436,10 +7461,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
- return vmx_exit_handlers_fastpath(vcpu);
+ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
@@ -7843,10 +7865,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_update_exception_bitmap(vcpu);
}
-static u64 vmx_get_perf_capabilities(void)
+static __init u64 vmx_get_perf_capabilities(void)
{
u64 perf_cap = PMU_CAP_FW_WRITES;
- struct x86_pmu_lbr lbr;
u64 host_perf_cap = 0;
if (!enable_pmu)
@@ -7856,15 +7877,43 @@ static u64 vmx_get_perf_capabilities(void)
rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
- x86_perf_get_lbr(&lbr);
- if (lbr.nr)
+ x86_perf_get_lbr(&vmx_lbr_caps);
+
+ /*
+ * KVM requires LBR callstack support, as the overhead due to
+ * context switching LBRs without said support is too high.
+ * See intel_pmu_create_guest_lbr_event() for more info.
+ */
+ if (!vmx_lbr_caps.has_callstack)
+ memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
+ else if (vmx_lbr_caps.nr)
perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
}
if (vmx_pebs_supported()) {
perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
- if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
- perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+
+ /*
+ * Disallow adaptive PEBS as it is functionally broken, can be
+ * used by the guest to read *host* LBRs, and can be used to
+ * bypass userspace event filters. To correctly and safely
+ * support adaptive PEBS, KVM needs to:
+ *
+ * 1. Account for the ADAPTIVE flag when (re)programming fixed
+ * counters.
+ *
+ * 2. Gain support from perf (or take direct control of counter
+ * programming) to support events without adaptive PEBS
+ * enabled for the hardware counter.
+ *
+ * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
+ * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
+ *
+ * 4. Document which PMU events are effectively exposed to the
+ * guest via adaptive PEBS, and make adaptive PEBS mutually
+ * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
+ */
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
}
return perf_cap;
@@ -7919,11 +7968,6 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- to_vmx(vcpu)->req_immediate_exit = true;
-}
-
static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info)
{
@@ -8376,8 +8420,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .request_immediate_exit = vmx_request_immediate_exit,
-
.sched_in = vmx_sched_in,
.cpu_dirty_log_size = PML_ENTITY_NUM,
@@ -8637,7 +8679,6 @@ static __init int hardware_setup(void)
if (!enable_preemption_timer) {
vmx_x86_ops.set_hv_timer = NULL;
vmx_x86_ops.cancel_hv_timer = NULL;
- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e3b0985bb74a..7e483366b31e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -7,14 +7,15 @@
#include <asm/kvm.h>
#include <asm/intel_pt.h>
#include <asm/perf_event.h>
+#include <asm/posted_intr.h>
#include "capabilities.h"
#include "../kvm_cache_regs.h"
-#include "posted_intr.h"
#include "vmcs.h"
#include "vmx_ops.h"
#include "../cpuid.h"
#include "run_flags.h"
+#include "../mmu.h"
#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
@@ -109,6 +110,8 @@ struct lbr_desc {
bool msr_passthrough;
};
+extern struct x86_pmu_lbr vmx_lbr_caps;
+
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -332,8 +335,6 @@ struct vcpu_vmx {
unsigned int ple_window;
bool ple_window_dirty;
- bool req_immediate_exit;
-
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
@@ -721,7 +722,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
if (!enable_ept)
return true;
- return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
+ return allow_smaller_maxphyaddr &&
+ cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits();
}
static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffe580169c93..91478b769af0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1399,22 +1399,19 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[array_index_nospec(dr, size)];
- break;
+ return vcpu->arch.db[array_index_nospec(dr, size)];
case 4:
case 6:
- *val = vcpu->arch.dr6;
- break;
+ return vcpu->arch.dr6;
case 5:
default: /* 7 */
- *val = vcpu->arch.dr7;
- break;
+ return vcpu->arch.dr7;
}
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -1624,7 +1621,7 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
- ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR)
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
static u64 kvm_get_arch_capabilities(void)
{
@@ -2860,7 +2857,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
return v * clock->mult;
}
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -2879,6 +2880,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
return mode;
}
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ ns += ktime_to_ns(gtod->clock.offset);
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@@ -2900,18 +2924,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
return mode;
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it do so. Returns true if host is
+ * using TSC based clocksource.
+ */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
/* checked again under seqlock below */
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
- tsc_timestamp));
+ return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+ tsc_timestamp));
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+ tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
u64 *tsc_timestamp)
{
@@ -3158,7 +3206,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
guest_hv_clock->version = ++vcpu->hv_clock.version;
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
@@ -3422,7 +3470,7 @@ static bool is_mci_status_msr(u32 msr)
static bool can_set_mci_status(struct kvm_vcpu *vcpu)
{
/* McStatusWrEn enabled? */
- if (guest_cpuid_is_amd_or_hygon(vcpu))
+ if (guest_cpuid_is_amd_compatible(vcpu))
return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
return false;
@@ -4680,7 +4728,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
- KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
@@ -5064,8 +5113,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
int idx;
if (vcpu->preempted) {
- if (!vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu);
/*
* Take the srcu lock as memslots will be accessed to check the gfn
@@ -5512,18 +5560,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
- unsigned long val;
+ unsigned int i;
memset(dbgregs, 0, sizeof(*dbgregs));
- memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- kvm_get_dr(vcpu, 6, &val);
- dbgregs->dr6 = val;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ dbgregs->db[i] = vcpu->arch.db[i];
+
+ dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
+ unsigned int i;
+
if (dbgregs->flags)
return -EINVAL;
@@ -5532,7 +5585,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (!kvm_dr7_valid(dbgregs->dr7))
return -EINVAL;
- memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ vcpu->arch.db[i] = dbgregs->db[i];
+
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
vcpu->arch.dr7 = dbgregs->dr7;
@@ -8180,10 +8235,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
-static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest)
+static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
{
- kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr);
}
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
@@ -8405,12 +8459,9 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
}
-static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
- u32 pmc)
+static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
{
- if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
- return 0;
- return -EINVAL;
+ return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -8542,7 +8593,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_msr_with_filter = emulator_set_msr_with_filter,
.get_msr_with_filter = emulator_get_msr_with_filter,
.get_msr = emulator_get_msr,
- .check_pmc = emulator_check_pmc,
+ .check_rdpmc_early = emulator_check_rdpmc_early,
.read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
@@ -8803,31 +8854,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_release_pfn_clean(pfn);
- /* The instructions are well-emulated on direct mmu. */
- if (vcpu->arch.mmu->root_role.direct) {
- unsigned int indirect_shadow_pages;
-
- write_lock(&vcpu->kvm->mmu_lock);
- indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
- write_unlock(&vcpu->kvm->mmu_lock);
-
- if (indirect_shadow_pages)
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
- return true;
- }
-
/*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-enter the
- * guest to let CPU execute the instruction.
+ * If emulation may have been triggered by a write to a shadowed page
+ * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+ * guest to let the CPU re-execute the instruction in the hope that the
+ * CPU can cleanly execute the instruction that KVM failed to emulate.
*/
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ if (vcpu->kvm->arch.indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
/*
- * If the access faults on its page table, it can not
- * be fixed by unprotecting shadow page and it should
- * be reported to userspace.
+ * If the failed instruction faulted on an access to page tables that
+ * are used to translate any part of the instruction, KVM can't resolve
+ * the issue by unprotecting the gfn, as zapping the shadow page will
+ * result in the instruction taking a !PRESENT page fault and thus put
+ * the vCPU into an infinite loop of page faults. E.g. KVM will create
+ * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+ * then zap the SPTE to unprotect the gfn, and then do it all over
+ * again. Report the error to userspace.
*/
return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
}
@@ -8922,7 +8966,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (unlikely(!r))
return 0;
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
/*
* rflags is the old, "raw" value of the flags. The new value has
@@ -9235,9 +9279,9 @@ writeback:
*/
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
if (ctxt->is_branch)
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
@@ -9648,11 +9692,13 @@ static void kvm_x86_check_cpu_compat(void *ret)
*(int *)ret = kvm_x86_check_processor_compatibility();
}
-static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
u64 host_pat;
int r, cpu;
+ guard(mutex)(&vendor_module_lock);
+
if (kvm_x86_ops.hardware_enable) {
pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
@@ -9782,17 +9828,6 @@ out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
return r;
}
-
-int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
-{
- int r;
-
- mutex_lock(&vendor_module_lock);
- r = __kvm_x86_vendor_init(ops);
- mutex_unlock(&vendor_module_lock);
-
- return r;
-}
EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
void kvm_x86_vendor_exit(void)
@@ -10689,12 +10724,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10944,10 +10973,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit) {
+ if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_request_immediate_exit)(vcpu);
- }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -10978,7 +11005,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -12065,7 +12092,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -12076,27 +12103,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (r < 0)
return r;
- if (irqchip_in_kernel(vcpu->kvm)) {
- r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
- if (r < 0)
- goto fail_mmu_destroy;
-
- /*
- * Defer evaluating inhibits until the vCPU is first run, as
- * this vCPU will not get notified of any changes until this
- * vCPU is visible to other vCPUs (marked online and added to
- * the set of vCPUs). Opportunistically mark APICv active as
- * VMX in particularly is highly unlikely to have inhibits.
- * Ignore the current per-VM APICv state so that vCPU creation
- * is guaranteed to run with a deterministic value, the request
- * will ensure the vCPU gets the correct state before VM-Entry.
- */
- if (enable_apicv) {
- vcpu->arch.apic->apicv_active = true;
- kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
- }
- } else
- static_branch_inc(&kvm_has_noapic_vcpu);
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
r = -ENOMEM;
@@ -12217,8 +12226,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
- if (!lapic_in_kernel(vcpu))
- static_branch_dec(&kvm_has_noapic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -12495,9 +12502,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
-
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -13100,11 +13104,13 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
- if (kvm_vcpu_apicv_active(vcpu) &&
- static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
- return true;
+ return kvm_vcpu_apicv_active(vcpu) &&
+ static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+}
- return false;
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
}
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
@@ -13127,9 +13133,6 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- if (vcpu != kvm_get_running_vcpu())
- return vcpu->arch.preempted_in_kernel;
-
return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}
@@ -13924,9 +13927,6 @@ module_init(kvm_x86_init);
static void __exit kvm_x86_exit(void)
{
- /*
- * If module_init() is implemented, module_exit() must also be
- * implemented to allow module unload.
- */
+ WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
}
module_exit(kvm_x86_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2f7e19166658..a8b71803777b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
@@ -431,12 +432,6 @@ static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
}
-enum kvm_intr_type {
- /* Values are arbitrary, but must be non-zero. */
- KVM_HANDLING_IRQ = 1,
- KVM_HANDLING_NMI,
-};
-
static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
enum kvm_intr_type intr)
{
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b4e738c6f1b..f65b35a05d91 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -10,7 +10,7 @@
#include "x86.h"
#include "xen.h"
#include "hyperv.h"
-#include "lapic.h"
+#include "irq.h"
#include <linux/eventfd.h>
#include <linux/kvm_host.h>
@@ -24,6 +24,7 @@
#include <xen/interface/sched.h>
#include <asm/xen/cpuid.h>
+#include <asm/pvclock.h>
#include "cpuid.h"
#include "trace.h"
@@ -34,41 +35,32 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
-static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+static int kvm_xen_shared_info_init(struct kvm *kvm)
{
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
struct pvclock_wall_clock *wc;
- gpa_t gpa = gfn_to_gpa(gfn);
u32 *wc_sec_hi;
u32 wc_version;
u64 wall_nsec;
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- if (gfn == KVM_XEN_INVALID_GFN) {
- kvm_gpc_deactivate(gpc);
- goto out;
- }
+ read_lock_irq(&gpc->lock);
+ while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+ read_unlock_irq(&gpc->lock);
- do {
- ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+ ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
if (ret)
goto out;
- /*
- * This code mirrors kvm_write_wall_clock() except that it writes
- * directly through the pfn cache and doesn't mark the page dirty.
- */
- wall_nsec = kvm_get_wall_clock_epoch(kvm);
-
- /* It could be invalid again already, so we need to check */
read_lock_irq(&gpc->lock);
+ }
- if (gpc->valid)
- break;
-
- read_unlock_irq(&gpc->lock);
- } while (1);
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -158,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
+ bool linux_wa)
{
+ int64_t kernel_now, delta;
+ uint64_t guest_now;
+
+ /*
+ * The guest provides the requested timeout in absolute nanoseconds
+ * of the KVM clock — as *it* sees it, based on the scaled TSC and
+ * the pvclock information provided by KVM.
+ *
+ * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
+ * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
+ * difference won't matter much as there is no cumulative effect.
+ *
+ * Calculate the time for some arbitrary point in time around "now"
+ * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
+ * delta between the kvmclock "now" value and the guest's requested
+ * timeout, apply the "Linux workaround" described below, and add
+ * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
+ * the absolute CLOCK_MONOTONIC time at which the timer should
+ * fire.
+ */
+ if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
+ static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ uint64_t host_tsc, guest_tsc;
+
+ if (!IS_ENABLED(CONFIG_64BIT) ||
+ !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
+ /*
+ * Don't fall back to get_kvmclock_ns() because it's
+ * broken; it has a systemic error in its results
+ * because it scales directly from host TSC to
+ * nanoseconds, and doesn't scale first to guest TSC
+ * and *then* to nanoseconds as the guest does.
+ *
+ * There is a small error introduced here because time
+ * continues to elapse between the ktime_get() and the
+ * subsequent rdtsc(). But not the systemic drift due
+ * to get_kvmclock_ns().
+ */
+ kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
+ host_tsc = rdtsc();
+ }
+
+ /* Calculate the guest kvmclock as the guest would do it. */
+ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
+ guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
+ guest_tsc);
+ } else {
+ /*
+ * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
+ *
+ * Also if the guest PV clock hasn't been set up yet, as is
+ * likely to be the case during migration when the vCPU has
+ * not been run yet. It would be possible to calculate the
+ * scaling factors properly in that case but there's not much
+ * point in doing so. The get_kvmclock_ns() drift accumulates
+ * over time, so it's OK to use it at startup. Besides, on
+ * migration there's going to be a little bit of skew in the
+ * precise moment at which timers fire anyway. Often they'll
+ * be in the "past" by the time the VM is running again after
+ * migration.
+ */
+ guest_now = get_kvmclock_ns(vcpu->kvm);
+ kernel_now = ktime_get();
+ }
+
+ delta = guest_abs - guest_now;
+
+ /*
+ * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
+ * negative absolute timeout values (caused by integer overflow), and
+ * for values about 13 days in the future (2^50ns) which would be
+ * caused by jiffies overflow. For those cases, Xen sets the timeout
+ * 100ms in the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep churning CPU
+ * time by waking it up). Emulate Xen's workaround when starting the
+ * timer in response to __HYPERVISOR_set_timer_op.
+ */
+ if (linux_wa &&
+ unlikely((int64_t)guest_abs < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ guest_abs = guest_now + delta;
+ }
+
/*
* Avoid races with the old timer firing. Checking timer_expires
* to avoid calling hrtimer_cancel() will only have false positives
@@ -171,14 +248,12 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_
atomic_set(&vcpu->arch.xen.timer_pending, 0);
vcpu->arch.xen.timer_expires = guest_abs;
- if (delta_ns <= 0) {
+ if (delta <= 0)
xen_timer_callback(&vcpu->arch.xen.timer);
- } else {
- ktime_t ktime_now = ktime_get();
+ else
hrtimer_start(&vcpu->arch.xen.timer,
- ktime_add_ns(ktime_now, delta_ns),
+ ktime_add_ns(kernel_now, delta),
HRTIMER_MODE_ABS_HARD);
- }
}
static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
@@ -452,14 +527,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
smp_wmb();
}
- if (user_len2)
+ if (user_len2) {
+ kvm_gpc_mark_dirty_in_slot(gpc2);
read_unlock(&gpc2->lock);
+ }
+ kvm_gpc_mark_dirty_in_slot(gpc1);
read_unlock_irqrestore(&gpc1->lock, flags);
-
- mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
- if (user_len2)
- mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
}
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -493,10 +567,9 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
-static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
struct kvm_lapic_irq irq = { };
- int r;
irq.dest_id = v->vcpu_id;
irq.vector = v->arch.xen.upcall_vector;
@@ -505,8 +578,7 @@ static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
irq.delivery_mode = APIC_DM_FIXED;
irq.level = 1;
- /* The fast version will always work for physical unicast */
- WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+ kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
}
/*
@@ -565,13 +637,13 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
: "0" (evtchn_pending_sel32));
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
kvm_xen_inject_vcpu_vector(v);
-
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -635,17 +707,59 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
} else {
mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place. Whilst it's not necessary to do this
+ * unless the mode is actually changed, it does no harm
+ * to make the call anyway.
+ */
+ r = kvm->arch.xen.shinfo_cache.active ?
+ kvm_xen_shared_info_init(kvm) : 0;
mutex_unlock(&kvm->arch.xen.xen_lock);
- r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
+ int idx;
+
mutex_lock(&kvm->arch.xen.xen_lock);
- r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
+ gfn_t gfn = data->u.shared_info.gfn;
+
+ if (gfn == KVM_XEN_INVALID_GFN) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
+ gfn_to_gpa(gfn), PAGE_SIZE);
+ }
+ } else {
+ void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
+
+ if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
+ r = -EINVAL;
+ } else if (!hva) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
+ (unsigned long)hva, PAGE_SIZE);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (!r && kvm->arch.xen.shinfo_cache.active)
+ r = kvm_xen_shared_info_init(kvm);
+
mutex_unlock(&kvm->arch.xen.xen_lock);
break;
-
+ }
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
@@ -699,13 +813,21 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (kvm->arch.xen.shinfo_cache.active)
+ if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
else
data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
+ data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
+ else
+ data->u.shared_info.hva = 0;
+ r = 0;
+ break;
+
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
data->u.vector = kvm->arch.xen.upcall_vector;
r = 0;
@@ -742,20 +864,33 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
/* No compat necessary here. */
BUILD_BUG_ON(sizeof(struct vcpu_info) !=
sizeof(struct compat_vcpu_info));
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
offsetof(struct compat_vcpu_info, time));
- if (data->u.gpa == KVM_XEN_INVALID_GPA) {
- kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
- r = 0;
- break;
+ if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
+ } else {
+ if (data->u.hva == 0) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.hva, sizeof(struct vcpu_info));
}
- r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
- data->u.gpa, sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -944,9 +1079,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
/* Start the timer if the new value has a valid vector+expiry. */
if (data->u.timer.port && data->u.timer.expires_ns)
- kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
- data->u.timer.expires_ns -
- get_kvmclock_ns(vcpu->kvm));
+ kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
r = 0;
break;
@@ -977,13 +1110,21 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
- if (vcpu->arch.xen.vcpu_info_cache.active)
+ if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
+ data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
+ else
+ data->u.hva = 0;
+ r = 0;
+ break;
+
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
@@ -1093,9 +1234,24 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
bool lm = is_long_mode(vcpu);
+ int r = 0;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ if (kvm->arch.xen.long_mode != lm) {
+ kvm->arch.xen.long_mode = lm;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place.
+ */
+ if (kvm->arch.xen.shinfo_cache.active &&
+ kvm_xen_shared_info_init(kvm))
+ r = 1;
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
- /* Latch long_mode for shared_info pages etc. */
- vcpu->kvm->arch.xen.long_mode = lm;
+ if (r)
+ return r;
/*
* If Xen hypercall intercept is enabled, fill the hypercall
@@ -1396,7 +1552,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
{
struct vcpu_set_singleshot_timer oneshot;
struct x86_exception e;
- s64 delta;
if (!kvm_xen_timer_enabled(vcpu))
return false;
@@ -1430,9 +1585,7 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
return true;
}
- /* A delta <= 0 results in an immediate callback, which is what we want */
- delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
- kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+ kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
*r = 0;
return true;
@@ -1455,29 +1608,10 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
if (!kvm_xen_timer_enabled(vcpu))
return false;
- if (timeout) {
- uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
- int64_t delta = timeout - guest_now;
-
- /* Xen has a 'Linux workaround' in do_set_timer_op() which
- * checks for negative absolute timeout values (caused by
- * integer overflow), and for values about 13 days in the
- * future (2^50ns) which would be caused by jiffies
- * overflow. For those cases, it sets the timeout 100ms in
- * the future (not *too* soon, since if a guest really did
- * set a long timeout on purpose we don't want to keep
- * churning CPU time by waking it up).
- */
- if (unlikely((int64_t)timeout < 0 ||
- (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
- delta = 100 * NSEC_PER_MSEC;
- timeout = guest_now + delta;
- }
-
- kvm_xen_start_timer(vcpu, timeout, delta);
- } else {
+ if (timeout)
+ kvm_xen_start_timer(vcpu, timeout, true);
+ else
kvm_xen_stop_timer(vcpu);
- }
*r = 0;
return true;
@@ -1621,9 +1755,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
}
- if (!vcpu->arch.xen.vcpu_info_cache.active)
- return -EINVAL;
-
if (xe->port >= max_evtchn_port(kvm))
return -EINVAL;
@@ -1731,8 +1862,6 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
mm_borrowed = true;
}
- mutex_lock(&kvm->arch.xen.xen_lock);
-
/*
* It is theoretically possible for the page to be unmapped
* and the MMU notifier to invalidate the shared_info before
@@ -1760,8 +1889,6 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);
- mutex_unlock(&kvm->arch.xen.xen_lock);
-
if (mm_borrowed)
kthread_unuse_mm(kvm->mm);
@@ -2109,14 +2236,10 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
- kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
- kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
}
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -2159,7 +2282,7 @@ void kvm_xen_init_vm(struct kvm *kvm)
{
mutex_init(&kvm->arch.xen.xen_lock);
idr_init(&kvm->arch.xen.evtchn_ports);
- kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
}
void kvm_xen_destroy_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index f8f1fe22d090..f5841d9000ae 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -18,6 +18,7 @@ extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
@@ -36,6 +37,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
const struct kvm_irq_routing_entry *ue);
void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The local APIC is being enabled. If the per-vCPU upcall vector is
+ * set and the vCPU's evtchn_upcall_pending flag is set, inject the
+ * interrupt.
+ */
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->arch.xen.upcall_vector && __kvm_xen_has_interrupt(vcpu))
+ kvm_xen_inject_vcpu_vector(vcpu);
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -101,6 +115,10 @@ static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
{
}
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return false;
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 721b528da9ac..391059b2c6fb 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -163,6 +163,7 @@ SYM_CODE_START_NOALIGN(srso_alias_untrain_ret)
lfence
jmp srso_alias_return_thunk
SYM_FUNC_END(srso_alias_untrain_ret)
+__EXPORT_THUNK(srso_alias_untrain_ret)
.popsection
.pushsection .text..__x86.rethunk_safe
@@ -224,10 +225,16 @@ SYM_CODE_START(srso_return_thunk)
SYM_CODE_END(srso_return_thunk)
#define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret"
-#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret"
#else /* !CONFIG_MITIGATION_SRSO */
+/* Dummy for the alternative in CALL_UNTRAIN_RET. */
+SYM_CODE_START(srso_alias_untrain_ret)
+ ANNOTATE_UNRET_SAFE
+ ANNOTATE_NOENDBR
+ ret
+ int3
+SYM_FUNC_END(srso_alias_untrain_ret)
+__EXPORT_THUNK(srso_alias_untrain_ret)
#define JMP_SRSO_UNTRAIN_RET "ud2"
-#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2"
#endif /* CONFIG_MITIGATION_SRSO */
#ifdef CONFIG_MITIGATION_UNRET_ENTRY
@@ -319,9 +326,7 @@ SYM_FUNC_END(retbleed_untrain_ret)
#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
SYM_FUNC_START(entry_untrain_ret)
- ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \
- JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO, \
- JMP_SRSO_ALIAS_UNTRAIN_RET, X86_FEATURE_SRSO_ALIAS
+ ALTERNATIVE JMP_RETBLEED_UNTRAIN_RET, JMP_SRSO_UNTRAIN_RET, X86_FEATURE_SRSO
SYM_FUNC_END(entry_untrain_ret)
__EXPORT_THUNK(entry_untrain_ret)
@@ -377,8 +382,15 @@ SYM_FUNC_END(call_depth_return_thunk)
SYM_CODE_START(__x86_return_thunk)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
+#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || \
+ defined(CONFIG_MITIGATION_SRSO) || \
+ defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)
ALTERNATIVE __stringify(ANNOTATE_UNRET_SAFE; ret), \
"jmp warn_thunk_thunk", X86_FEATURE_ALWAYS
+#else
+ ANNOTATE_UNRET_SAFE
+ ret
+#endif
int3
SYM_CODE_END(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)
diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c
index 1b118fd93140..39423ec409e1 100644
--- a/arch/x86/math-emu/fpu_etc.c
+++ b/arch/x86/math-emu/fpu_etc.c
@@ -120,9 +120,14 @@ static void fxam(FPU_REG *st0_ptr, u_char st0tag)
setcc(c);
}
+static void FPU_ST0_illegal(FPU_REG *st0_ptr, u_char st0_tag)
+{
+ FPU_illegal();
+}
+
static FUNC_ST0 const fp_etc_table[] = {
- fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal,
- ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal
+ fchs, fabs, FPU_ST0_illegal, FPU_ST0_illegal,
+ ftst_, fxam, FPU_ST0_illegal, FPU_ST0_illegal,
};
void FPU_etc(void)
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
index 990d847ae902..85daf98c81c3 100644
--- a/arch/x86/math-emu/fpu_trig.c
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -433,13 +433,13 @@ static void fxtract(FPU_REG *st0_ptr, u_char st0_tag)
#endif /* PARANOID */
}
-static void fdecstp(void)
+static void fdecstp(FPU_REG *st0_ptr, u_char st0_tag)
{
clear_C1();
top--;
}
-static void fincstp(void)
+static void fincstp(FPU_REG *st0_ptr, u_char st0_tag)
{
clear_C1();
top++;
@@ -1631,7 +1631,7 @@ static void fscale(FPU_REG *st0_ptr, u_char st0_tag)
static FUNC_ST0 const trig_table_a[] = {
f2xm1, fyl2x, fptan, fpatan,
- fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp
+ fxtract, fprem1, fdecstp, fincstp,
};
void FPU_triga(void)
diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c
index 742619e94bdf..003a0b2753e6 100644
--- a/arch/x86/math-emu/reg_constant.c
+++ b/arch/x86/math-emu/reg_constant.c
@@ -108,8 +108,13 @@ static void fldz(int rc)
typedef void (*FUNC_RC) (int);
+static void FPU_RC_illegal(int unused)
+{
+ FPU_illegal();
+}
+
static FUNC_RC constants_table[] = {
- fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal
+ fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, FPU_RC_illegal
};
void fconst(void)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index b7b88c1d91ec..89079ea73e65 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -362,9 +362,9 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
}
-static void ptdump_walk_pgd_level_core(struct seq_file *m,
- struct mm_struct *mm, pgd_t *pgd,
- bool checkwx, bool dmesg)
+bool ptdump_walk_pgd_level_core(struct seq_file *m,
+ struct mm_struct *mm, pgd_t *pgd,
+ bool checkwx, bool dmesg)
{
const struct ptdump_range ptdump_ranges[] = {
#ifdef CONFIG_X86_64
@@ -391,12 +391,17 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m,
ptdump_walk_pgd(&st.ptdump, mm, pgd);
if (!checkwx)
- return;
- if (st.wx_pages)
+ return true;
+ if (st.wx_pages) {
pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
st.wx_pages);
- else
+
+ return false;
+ } else {
pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
+
+ return true;
+ }
}
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
@@ -431,9 +436,12 @@ void ptdump_walk_user_pgd_level_checkwx(void)
#endif
}
-void ptdump_walk_pgd_level_checkwx(void)
+bool ptdump_walk_pgd_level_checkwx(void)
{
- ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return true;
+
+ return ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
}
static int __init pt_dump_init(void)
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index b522933bfa56..51986e8a9d35 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -164,13 +164,6 @@ static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
return ex_handler_default(fixup, regs);
}
-static bool ex_handler_copy(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
-{
- WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
- return ex_handler_fault(fixup, regs, trapnr);
-}
-
static bool ex_handler_msr(const struct exception_table_entry *fixup,
struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
@@ -341,8 +334,6 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
return ex_handler_fault(e, regs, trapnr);
case EX_TYPE_UACCESS:
return ex_handler_uaccess(e, regs, trapnr, fault_addr);
- case EX_TYPE_COPY:
- return ex_handler_copy(e, regs, trapnr);
case EX_TYPE_CLEAR_FS:
return ex_handler_clear_fs(e, regs);
case EX_TYPE_FPU_RESTORE:
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 402e08f6b7ec..f26ecabc9424 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -252,7 +252,7 @@ static noinline int vmalloc_fault(unsigned long address)
if (!pmd_k)
return -1;
- if (pmd_large(*pmd_k))
+ if (pmd_leaf(*pmd_k))
return 0;
pte_k = pte_offset_kernel(pmd_k, address);
@@ -321,7 +321,7 @@ static void dump_pagetable(unsigned long address)
* And let's rather not kmap-atomic the pte, just in case
* it's allocated already:
*/
- if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
+ if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd))
goto out;
pte = pte_offset_kernel(pmd, address);
@@ -370,7 +370,7 @@ static void dump_pagetable(unsigned long address)
goto bad;
pr_cont("P4D %lx ", p4d_val(*p4d));
- if (!p4d_present(*p4d) || p4d_large(*p4d))
+ if (!p4d_present(*p4d) || p4d_leaf(*p4d))
goto out;
pud = pud_offset(p4d, address);
@@ -378,7 +378,7 @@ static void dump_pagetable(unsigned long address)
goto bad;
pr_cont("PUD %lx ", pud_val(*pud));
- if (!pud_present(*pud) || pud_large(*pud))
+ if (!pud_present(*pud) || pud_leaf(*pud))
goto out;
pmd = pmd_offset(pud, address);
@@ -386,7 +386,7 @@ static void dump_pagetable(unsigned long address)
goto bad;
pr_cont("PMD %lx ", pmd_val(*pmd));
- if (!pmd_present(*pmd) || pmd_large(*pmd))
+ if (!pmd_present(*pmd) || pmd_leaf(*pmd))
goto out;
pte = pte_offset_kernel(pmd, address);
@@ -514,18 +514,19 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
if (error_code & X86_PF_INSTR) {
unsigned int level;
+ bool nx, rw;
pgd_t *pgd;
pte_t *pte;
pgd = __va(read_cr3_pa());
pgd += pgd_index(address);
- pte = lookup_address_in_pgd(pgd, address, &level);
+ pte = lookup_address_in_pgd_attr(pgd, address, &level, &nx, &rw);
- if (pte && pte_present(*pte) && !pte_exec(*pte))
+ if (pte && pte_present(*pte) && (!pte_exec(*pte) || nx))
pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
from_kuid(&init_user_ns, current_uid()));
- if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ if (pte && pte_present(*pte) && pte_exec(*pte) && !nx &&
(pgd_flags(*pgd) & _PAGE_USER) &&
(__read_cr4() & X86_CR4_SMEP))
pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
@@ -723,39 +724,8 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
WARN_ON_ONCE(user_mode(regs));
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- if (si_code == SEGV_PKUERR) {
- force_sig_pkuerr((void __user *)address, pkey);
- } else {
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address))
return;
- }
/*
* AMD erratum #91 manifests as a spurious page fault on a PREFETCH
@@ -1036,21 +1006,21 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address)
if (!p4d_present(*p4d))
return 0;
- if (p4d_large(*p4d))
+ if (p4d_leaf(*p4d))
return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
return spurious_kernel_fault_check(error_code, (pte_t *) pud);
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return 0;
- if (pmd_large(*pmd))
+ if (pmd_leaf(*pmd))
return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
pte = pte_offset_kernel(pmd, address);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index f50cc210a981..968d7005f4a7 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -26,31 +26,18 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
for (; addr < end; addr = next) {
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
- bool use_gbpage;
next = (addr & PUD_MASK) + PUD_SIZE;
if (next > end)
next = end;
- /* if this is already a gbpage, this portion is already mapped */
- if (pud_large(*pud))
- continue;
-
- /* Is using a gbpage allowed? */
- use_gbpage = info->direct_gbpages;
-
- /* Don't use gbpage if it maps more than the requested region. */
- /* at the begining: */
- use_gbpage &= ((addr & ~PUD_MASK) == 0);
- /* ... or at the end: */
- use_gbpage &= ((next & ~PUD_MASK) == 0);
-
- /* Never overwrite existing mappings */
- use_gbpage &= !pud_present(*pud);
-
- if (use_gbpage) {
+ if (info->direct_gbpages) {
pud_t pudval;
+ if (pud_present(*pud))
+ continue;
+
+ addr &= PUD_MASK;
pudval = __pud((addr - info->offset) | info->page_flag);
set_pud(pud, pudval);
continue;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 679893ea5e68..6b43b6480354 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -261,21 +261,17 @@ static void __init probe_page_size_mask(void)
}
}
-#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \
- .family = 6, \
- .model = _model, \
- }
/*
* INVLPG may not properly flush Global entries
* on these CPUs when PCIDs are enabled.
*/
static const struct x86_cpu_id invlpg_miss_ids[] = {
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE ),
- INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
- INTEL_MATCH(INTEL_FAM6_ATOM_GRACEMONT ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
- INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
+ X86_MATCH_VFM(INTEL_ALDERLAKE, 0),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0),
{}
};
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b63403d7179d..ac41b1e0940d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -463,7 +463,7 @@ void __init native_pagetable_init(void)
break;
/* should not be large page here */
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
pfn, pmd, __pa(pmd));
BUG_ON(1);
@@ -800,6 +800,4 @@ void mark_rodata_ro(void)
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
- if (__supported_pte_mask & _PAGE_NX)
- debug_checkwx();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a0dffaca6d2b..7e177856ee4f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -530,7 +530,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
}
if (!pmd_none(*pmd)) {
- if (!pmd_large(*pmd)) {
+ if (!pmd_leaf(*pmd)) {
spin_lock(&init_mm.page_table_lock);
pte = (pte_t *)pmd_page_vaddr(*pmd);
paddr_last = phys_pte_init(pte, paddr,
@@ -617,7 +617,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
}
if (!pud_none(*pud)) {
- if (!pud_large(*pud)) {
+ if (!pud_leaf(*pud)) {
pmd = pmd_offset(pud, 0);
paddr_last = phys_pmd_init(pmd, paddr,
paddr_end,
@@ -1114,7 +1114,7 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
if (!pmd_present(*pmd))
continue;
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
IS_ALIGNED(next, PMD_SIZE)) {
if (!direct)
@@ -1163,7 +1163,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
if (!pud_present(*pud))
continue;
- if (pud_large(*pud) &&
+ if (pud_leaf(*pud) &&
IS_ALIGNED(addr, PUD_SIZE) &&
IS_ALIGNED(next, PUD_SIZE)) {
spin_lock(&init_mm.page_table_lock);
@@ -1197,7 +1197,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
if (!p4d_present(*p4d))
continue;
- BUILD_BUG_ON(p4d_large(*p4d));
+ BUILD_BUG_ON(p4d_leaf(*p4d));
pud_base = pud_offset(p4d, 0);
remove_pud_table(pud_base, addr, next, altmap, direct);
@@ -1412,8 +1412,6 @@ void mark_rodata_ro(void)
(void *)text_end, (void *)rodata_start);
free_kernel_image_pages("unused kernel image (rodata/data gap)",
(void *)rodata_end, (void *)_sdata);
-
- debug_checkwx();
}
/*
@@ -1522,9 +1520,9 @@ void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
unsigned long addr, unsigned long next)
{
- int large = pmd_large(*pmd);
+ int large = pmd_leaf(*pmd);
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
vmemmap_verify((pte_t *)pmd, node, addr, next);
vmemmap_use_sub_pmd(addr, next);
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0302491d799d..9dddf19a5571 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -95,7 +95,7 @@ static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (!pmd_large(*pmd))
+ if (!pmd_leaf(*pmd))
kasan_populate_pmd(pmd, addr, next, nid);
} while (pmd++, addr = next, addr != end);
}
@@ -115,7 +115,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (!pud_large(*pud))
+ if (!pud_leaf(*pud))
kasan_populate_pud(pud, addr, next, nid);
} while (pud++, addr = next, addr != end);
}
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 6f3b3e028718..0a120d85d7bb 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -102,6 +102,13 @@ void __init mem_encrypt_setup_arch(void)
phys_addr_t total_mem = memblock_phys_mem_size();
unsigned long size;
+ /*
+ * Do RMP table fixups after the e820 tables have been setup by
+ * e820__memory_setup().
+ */
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ snp_fixup_e820_tables();
+
if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 70b91de2e053..422602f6039b 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -492,6 +492,24 @@ void __init sme_early_init(void)
*/
if (sev_status & MSR_AMD64_SEV_ENABLED)
ia32_disable();
+
+ /*
+ * Override init functions that scan the ROM region in SEV-SNP guests,
+ * as this memory is not pre-validated and would thus cause a crash.
+ */
+ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
+ x86_init.mpparse.find_mptable = x86_init_noop;
+ x86_init.pci.init_irq = x86_init_noop;
+ x86_init.resources.probe_roms = x86_init_noop;
+
+ /*
+ * DMI setup behavior for SEV-SNP guests depends on
+ * efi_enabled(EFI_CONFIG_TABLES), which hasn't been
+ * parsed yet. snp_dmi_setup() will run after that
+ * parsing has happened.
+ */
+ x86_init.resources.dmi_setup = snp_dmi_setup;
+ }
}
void __init mem_encrypt_free_decrypted_mem(void)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 64b5005d49e5..ac33b2263a43 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -141,7 +141,7 @@ static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
set_pud(pud, __pud(PUD_FLAGS | __pa(pmd)));
}
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
return NULL;
return pud;
@@ -157,7 +157,7 @@ static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
return;
pmd = pmd_offset(pud, ppd->vaddr);
- if (pmd_large(*pmd))
+ if (pmd_leaf(*pmd))
return;
set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
@@ -181,7 +181,7 @@ static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
}
- if (pmd_large(*pmd))
+ if (pmd_leaf(*pmd))
return;
pte = pte_offset_kernel(pmd, ppd->vaddr);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 65e9a6e391c0..ce84ba86e69e 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -929,6 +929,8 @@ int memory_add_physaddr_to_nid(u64 start)
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
static int __init cmp_memblk(const void *a, const void *b)
{
const struct numa_memblk *ma = *(const struct numa_memblk **)a;
@@ -1001,5 +1003,3 @@ int __init numa_fill_memblks(u64 start, u64 end)
}
return 0;
}
-
-#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 104544359d69..025fd7ea5d69 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,6 +24,7 @@
#include <linux/memblock.h>
#include <linux/init.h>
+#include <asm/pgtable_areas.h>
#include "numa_internal.h"
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 0d72183b5dd0..36b603d0cdde 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -947,6 +947,38 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
+static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
+ pgprot_t *pgprot)
+{
+ unsigned long prot;
+
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
+
+ /*
+ * We need the starting PFN and cachemode used for track_pfn_remap()
+ * that covered the whole VMA. For most mappings, we can obtain that
+ * information from the page tables. For COW mappings, we might now
+ * suddenly have anon folios mapped and follow_phys() will fail.
+ *
+ * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
+ * detect the PFN. If we need the cachemode as well, we're out of luck
+ * for now and have to fail fork().
+ */
+ if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
+ if (pgprot)
+ *pgprot = __pgprot(prot);
+ return 0;
+ }
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (pgprot)
+ return -EINVAL;
+ *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+ return 0;
+ }
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
/*
* track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
@@ -957,20 +989,13 @@ static void free_pfn_range(u64 paddr, unsigned long size)
int track_pfn_copy(struct vm_area_struct *vma)
{
resource_size_t paddr;
- unsigned long prot;
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (vma->vm_flags & VM_PAT) {
- /*
- * reserve the whole chunk covered by vma. We need the
- * starting address and protection from pte.
- */
- if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
- WARN_ON_ONCE(1);
+ if (get_pat_info(vma, &paddr, &pgprot))
return -EINVAL;
- }
- pgprot = __pgprot(prot);
+ /* reserve the whole chunk covered by vma. */
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}
@@ -1045,7 +1070,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size, bool mm_wr_locked)
{
resource_size_t paddr;
- unsigned long prot;
if (vma && !(vma->vm_flags & VM_PAT))
return;
@@ -1053,11 +1077,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
/* free the chunk starting from pfn or the whole chunk */
paddr = (resource_size_t)pfn << PAGE_SHIFT;
if (!paddr && !size) {
- if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
- WARN_ON_ONCE(1);
+ if (get_pat_info(vma, &paddr, NULL))
return;
- }
-
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index e5b2985a7c51..19fdfbb171ed 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -619,7 +619,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
* Validate strict W^X semantics.
*/
static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
- unsigned long pfn, unsigned long npg)
+ unsigned long pfn, unsigned long npg,
+ bool nx, bool rw)
{
unsigned long end;
@@ -641,6 +642,10 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
return new;
+ /* Non-leaf translation entries can disable writing or execution. */
+ if (!rw || nx)
+ return new;
+
end = start + npg * PAGE_SIZE - 1;
WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
(unsigned long long)pgprot_val(old),
@@ -657,50 +662,77 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
/*
* Lookup the page table entry for a virtual address in a specific pgd.
- * Return a pointer to the entry and the level of the mapping.
+ * Return a pointer to the entry, the level of the mapping, and the effective
+ * NX and RW bits of all page table levels.
*/
-pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
- unsigned int *level)
+pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
+ unsigned int *level, bool *nx, bool *rw)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
*level = PG_LEVEL_NONE;
+ *nx = false;
+ *rw = true;
if (pgd_none(*pgd))
return NULL;
+ *nx |= pgd_flags(*pgd) & _PAGE_NX;
+ *rw &= pgd_flags(*pgd) & _PAGE_RW;
+
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return NULL;
*level = PG_LEVEL_512G;
- if (p4d_large(*p4d) || !p4d_present(*p4d))
+ if (p4d_leaf(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
+ *nx |= p4d_flags(*p4d) & _PAGE_NX;
+ *rw &= p4d_flags(*p4d) & _PAGE_RW;
+
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
*level = PG_LEVEL_1G;
- if (pud_large(*pud) || !pud_present(*pud))
+ if (pud_leaf(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+ *nx |= pud_flags(*pud) & _PAGE_NX;
+ *rw &= pud_flags(*pud) & _PAGE_RW;
+
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
return NULL;
*level = PG_LEVEL_2M;
- if (pmd_large(*pmd) || !pmd_present(*pmd))
+ if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return (pte_t *)pmd;
+ *nx |= pmd_flags(*pmd) & _PAGE_NX;
+ *rw &= pmd_flags(*pmd) & _PAGE_RW;
+
*level = PG_LEVEL_4K;
return pte_offset_kernel(pmd, address);
}
/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned int *level)
+{
+ bool nx, rw;
+
+ return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
+}
+
+/*
* Lookup the page table entry for a virtual address. Return a pointer
* to the entry and the level of the mapping.
*
@@ -715,13 +747,16 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
EXPORT_SYMBOL_GPL(lookup_address);
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
- unsigned int *level)
+ unsigned int *level, bool *nx, bool *rw)
{
- if (cpa->pgd)
- return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
- address, level);
+ pgd_t *pgd;
+
+ if (!cpa->pgd)
+ pgd = pgd_offset_k(address);
+ else
+ pgd = cpa->pgd + pgd_index(address);
- return lookup_address(address, level);
+ return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
}
/*
@@ -739,11 +774,11 @@ pmd_t *lookup_pmd_address(unsigned long address)
return NULL;
p4d = p4d_offset(pgd, address);
- if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
+ if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, address);
- if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
return NULL;
return pmd_offset(pud, address);
@@ -849,12 +884,13 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
pgprot_t old_prot, new_prot, req_prot, chk_prot;
pte_t new_pte, *tmp;
enum pg_level level;
+ bool nx, rw;
/*
* Check for races, another CPU might have split this page
* up already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte)
return 1;
@@ -965,7 +1001,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
psize, CPA_DETECT);
- new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages);
+ new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
+ nx, rw);
/*
* If there is a conflict, split the large page.
@@ -1046,6 +1083,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
pte_t *pbase = (pte_t *)page_address(base);
unsigned int i, level;
pgprot_t ref_prot;
+ bool nx, rw;
pte_t *tmp;
spin_lock(&pgd_lock);
@@ -1053,7 +1091,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
* Check for races, another CPU might have split this page
* up for us already:
*/
- tmp = _lookup_address_cpa(cpa, address, &level);
+ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (tmp != kpte) {
spin_unlock(&pgd_lock);
return 1;
@@ -1233,7 +1271,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* Try to unmap in 2M chunks.
*/
while (end - start >= PMD_SIZE) {
- if (pmd_large(*pmd))
+ if (pmd_leaf(*pmd))
pmd_clear(pmd);
else
__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
@@ -1278,7 +1316,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
*/
while (end - start >= PUD_SIZE) {
- if (pud_large(*pud))
+ if (pud_leaf(*pud))
pud_clear(pud);
else
unmap_pmd_range(pud, start, start + PUD_SIZE);
@@ -1594,10 +1632,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
int do_split, err;
unsigned int level;
pte_t *kpte, old_pte;
+ bool nx, rw;
address = __cpa_addr(cpa, cpa->curpage);
repeat:
- kpte = _lookup_address_cpa(cpa, address, &level);
+ kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
if (!kpte)
return __cpa_process_fault(cpa, address, primary);
@@ -1619,7 +1658,8 @@ repeat:
new_prot = static_protections(new_prot, address, pfn, 1, 0,
CPA_PROTECT);
- new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1);
+ new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
+ nx, rw);
new_prot = pgprot_clear_protnone_bits(new_prot);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index cceb779d882d..d007591b8059 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -777,7 +777,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
*/
int pud_clear_huge(pud_t *pud)
{
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
pud_clear(pud);
return 1;
}
@@ -792,7 +792,7 @@ int pud_clear_huge(pud_t *pud)
*/
int pmd_clear_huge(pmd_t *pmd)
{
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
pmd_clear(pmd);
return 1;
}
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 669ba1c345b3..2e69abf4f852 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -185,7 +185,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
}
- BUILD_BUG_ON(pgd_large(*pgd) != 0);
+ BUILD_BUG_ON(pgd_leaf(*pgd) != 0);
return p4d_offset(pgd, address);
}
@@ -206,7 +206,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
if (!p4d)
return NULL;
- BUILD_BUG_ON(p4d_large(*p4d) != 0);
+ BUILD_BUG_ON(p4d_leaf(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
if (WARN_ON_ONCE(!new_pud_page))
@@ -217,7 +217,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
pud = pud_offset(p4d, address);
/* The user page tables do not use large mappings: */
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
WARN_ON(1);
return NULL;
}
@@ -252,7 +252,7 @@ static pte_t *pti_user_pagetable_walk_pte(unsigned long address)
return NULL;
/* We can't do anything sensible if we hit a large mapping. */
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
WARN_ON(1);
return NULL;
}
@@ -341,7 +341,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
continue;
}
- if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
+ if (pmd_leaf(*pmd) || level == PTI_CLONE_PMD) {
target_pmd = pti_user_pagetable_walk_pmd(addr);
if (WARN_ON(!target_pmd))
return;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4af930947380..44ac64f3a047 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -299,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
write_cr3(new_mm_cr3);
}
-void leave_mm(int cpu)
+void leave_mm(void)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -327,7 +327,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
unsigned long flags;
local_irq_save(flags);
- switch_mm_irqs_off(prev, next, tsk);
+ switch_mm_irqs_off(NULL, next, tsk);
local_irq_restore(flags);
}
@@ -492,10 +492,16 @@ void cr4_update_pce(void *ignored)
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
-void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+/*
+ * This optimizes when not actually switching mm's. Some architectures use the
+ * 'unused' argument for this optimization, but x86 must use
+ * 'cpu_tlbstate.loaded_mm' instead because it does not always keep
+ * 'current->active_mm' up to date.
+ */
+void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
struct task_struct *tsk)
{
- struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
@@ -504,15 +510,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
bool need_flush;
u16 new_asid;
- /*
- * NB: The scheduler will call us with prev == next when switching
- * from lazy TLB mode to normal mode if active_mm isn't changing.
- * When this happens, we don't assume that CR3 (and hence
- * cpu_tlbstate.loaded_mm) matches next.
- *
- * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
- */
-
/* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
@@ -527,7 +524,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
tlbstate_lam_cr3_mask()))) {
/*
* If we were to BUG here, we'd be very likely to kill
@@ -559,7 +556,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* provides that full memory barrier and core serializing
* instruction.
*/
- if (real_prev == next) {
+ if (prev == next) {
/* Not actually switching mm's */
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
@@ -574,7 +571,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
- if (WARN_ON_ONCE(real_prev != &init_mm &&
+ if (WARN_ON_ONCE(prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -616,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
* but the bitmap manipulation can cause cache line contention.
*/
- if (real_prev != &init_mm) {
+ if (prev != &init_mm) {
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
- mm_cpumask(real_prev)));
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ mm_cpumask(prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
}
/*
@@ -656,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
- if (next != real_prev) {
+ if (next != prev) {
cr4_update_pce_mm(next);
- switch_ldt(real_prev, next);
+ switch_ldt(prev, next);
}
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index a7ba8e178645..5159c7a22922 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -480,7 +480,7 @@ static int emit_call(u8 **pprog, void *func, void *ip)
static int emit_rsb_call(u8 **pprog, void *func, void *ip)
{
OPTIMIZER_HIDE_VAR(func);
- x86_call_depth_emit_accounting(pprog, func);
+ ip += x86_call_depth_emit_accounting(pprog, func, ip);
return emit_patch(pprog, func, ip, 0xE8);
}
@@ -816,9 +816,10 @@ done:
static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
const u32 imm32_hi, const u32 imm32_lo)
{
+ u64 imm64 = ((u64)imm32_hi << 32) | (u32)imm32_lo;
u8 *prog = *pprog;
- if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
+ if (is_uimm32(imm64)) {
/*
* For emitting plain u32, where sign bit must not be
* propagated LLVM tends to load imm64 over mov32
@@ -826,6 +827,8 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
* 'mov %eax, imm32' instead.
*/
emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
+ } else if (is_simm32(imm64)) {
+ emit_mov_imm32(&prog, true, dst_reg, imm32_lo);
} else {
/* movabsq rax, imm64 */
EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
@@ -1169,6 +1172,54 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
return 0;
}
+static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size,
+ u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+ u8 *prog = *pprog;
+
+ EMIT1(0xF0); /* lock prefix */
+ switch (size) {
+ case BPF_W:
+ EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg));
+ break;
+ case BPF_DW:
+ EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg));
+ break;
+ default:
+ pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n");
+ return -EFAULT;
+ }
+
+ /* emit opcode */
+ switch (atomic_op) {
+ case BPF_ADD:
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* lock *(u32/u64*)(dst_reg + idx_reg + off) <op>= src_reg */
+ EMIT1(simple_alu_opcodes[atomic_op]);
+ break;
+ case BPF_ADD | BPF_FETCH:
+ /* src_reg = atomic_fetch_add(dst_reg + idx_reg + off, src_reg); */
+ EMIT2(0x0F, 0xC1);
+ break;
+ case BPF_XCHG:
+ /* src_reg = atomic_xchg(dst_reg + idx_reg + off, src_reg); */
+ EMIT1(0x87);
+ break;
+ case BPF_CMPXCHG:
+ /* r0 = atomic_cmpxchg(dst_reg + idx_reg + off, r0, src_reg); */
+ EMIT2(0x0F, 0xB1);
+ break;
+ default:
+ pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
+ return -EFAULT;
+ }
+ emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
+ *pprog = prog;
+ return 0;
+}
+
#define DONT_CLEAR 1
bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
@@ -1351,8 +1402,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
break;
case BPF_ALU64 | BPF_MOV | BPF_X:
- if (insn->off == BPF_ADDR_SPACE_CAST &&
- insn->imm == 1U << 16) {
+ if (insn_is_cast_user(insn)) {
if (dst_reg != src_reg)
/* 32-bit mov */
emit_mov_reg(&prog, false, dst_reg, src_reg);
@@ -1383,6 +1433,16 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
break;
+ } else if (insn_is_mov_percpu_addr(insn)) {
+ /* mov <dst>, <src> (if necessary) */
+ EMIT_mov(dst_reg, src_reg);
+#ifdef CONFIG_SMP
+ /* add <dst>, gs:[<off>] */
+ EMIT2(0x65, add_1mod(0x48, dst_reg));
+ EMIT3(0x03, add_2reg(0x04, 0, dst_reg), 0x25);
+ EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+ break;
}
fallthrough;
case BPF_ALU | BPF_MOV | BPF_X:
@@ -1807,36 +1867,41 @@ populate_extable:
if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
/* Conservatively check that src_reg + insn->off is a kernel address:
- * src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
- * src_reg is used as scratch for src_reg += insn->off and restored
- * after emit_ldx if necessary
+ * src_reg + insn->off > TASK_SIZE_MAX + PAGE_SIZE
+ * and
+ * src_reg + insn->off < VSYSCALL_ADDR
*/
- u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE - VSYSCALL_ADDR;
u8 *end_of_jmp;
- /* At end of these emitted checks, insn->off will have been added
- * to src_reg, so no need to do relative load with insn->off offset
- */
- insn_off = 0;
+ /* movabsq r10, VSYSCALL_ADDR */
+ emit_mov_imm64(&prog, BPF_REG_AX, (long)VSYSCALL_ADDR >> 32,
+ (u32)(long)VSYSCALL_ADDR);
- /* movabsq r11, limit */
- EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
- EMIT((u32)limit, 4);
- EMIT(limit >> 32, 4);
+ /* mov src_reg, r11 */
+ EMIT_mov(AUX_REG, src_reg);
if (insn->off) {
- /* add src_reg, insn->off */
- maybe_emit_1mod(&prog, src_reg, true);
- EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
+ /* add r11, insn->off */
+ maybe_emit_1mod(&prog, AUX_REG, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
}
- /* cmp src_reg, r11 */
- maybe_emit_mod(&prog, src_reg, AUX_REG, true);
- EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+ /* sub r11, r10 */
+ maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true);
+ EMIT2(0x29, add_2reg(0xC0, AUX_REG, BPF_REG_AX));
+
+ /* movabsq r10, limit */
+ emit_mov_imm64(&prog, BPF_REG_AX, (long)limit >> 32,
+ (u32)(long)limit);
- /* if unsigned '>=', goto load */
- EMIT2(X86_JAE, 0);
+ /* cmp r10, r11 */
+ maybe_emit_mod(&prog, AUX_REG, BPF_REG_AX, true);
+ EMIT2(0x39, add_2reg(0xC0, AUX_REG, BPF_REG_AX));
+
+ /* if unsigned '>', goto load */
+ EMIT2(X86_JA, 0);
end_of_jmp = prog;
/* xor dst_reg, dst_reg */
@@ -1862,18 +1927,6 @@ populate_extable:
/* populate jmp_offset for JMP above */
start_of_ldx[-1] = prog - start_of_ldx;
- if (insn->off && src_reg != dst_reg) {
- /* sub src_reg, insn->off
- * Restore src_reg after "add src_reg, insn->off" in prev
- * if statement. But if src_reg == dst_reg, emit_ldx
- * above already clobbered src_reg, so no need to restore.
- * If add src_reg, insn->off was unnecessary, no need to
- * restore either.
- */
- maybe_emit_1mod(&prog, src_reg, true);
- EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
- }
-
if (!bpf_prog->aux->extable)
break;
@@ -1970,22 +2023,28 @@ populate_extable:
return err;
break;
+ case BPF_STX | BPF_PROBE_ATOMIC | BPF_W:
+ case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW:
+ start_of_ldx = prog;
+ err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code),
+ dst_reg, src_reg, X86_REG_R12, insn->off);
+ if (err)
+ return err;
+ goto populate_extable;
+
/* call */
case BPF_JMP | BPF_CALL: {
- int offs;
+ u8 *ip = image + addrs[i - 1];
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
- if (!imm32)
- return -EINVAL;
- offs = 7 + x86_call_depth_emit_accounting(&prog, func);
- } else {
- if (!imm32)
- return -EINVAL;
- offs = x86_call_depth_emit_accounting(&prog, func);
+ ip += 7;
}
- if (emit_call(&prog, func, image + addrs[i - 1] + offs))
+ if (!imm32)
+ return -EINVAL;
+ ip += x86_call_depth_emit_accounting(&prog, func, ip);
+ if (emit_call(&prog, func, ip))
return -EINVAL;
break;
}
@@ -2835,7 +2894,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
* Direct-call fentry stub, as such it needs accounting for the
* __fentry__ call.
*/
- x86_call_depth_emit_accounting(&prog, NULL);
+ x86_call_depth_emit_accounting(&prog, NULL, image);
}
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
@@ -3004,12 +3063,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size)
bpf_prog_pack_free(image, size);
}
-void arch_protect_bpf_trampoline(void *image, unsigned int size)
-{
-}
-
-void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
+int arch_protect_bpf_trampoline(void *image, unsigned int size)
{
+ return 0;
}
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
@@ -3369,6 +3425,11 @@ bool bpf_jit_supports_subprog_tailcalls(void)
return true;
}
+bool bpf_jit_supports_percpu_insn(void)
+{
+ return true;
+}
+
void bpf_jit_free(struct bpf_prog *prog)
{
if (prog->jited) {
@@ -3472,7 +3533,28 @@ bool bpf_jit_supports_arena(void)
return true;
}
+bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+ if (!in_arena)
+ return true;
+ switch (insn->code) {
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
+ if (insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH))
+ return false;
+ }
+ return true;
+}
+
bool bpf_jit_supports_ptr_xchg(void)
{
return true;
}
+
+/* x86-64 JIT emits its own code to filter user addresses so return 0 here */
+u64 bpf_arch_uaddress_limit(void)
+{
+ return 0;
+}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index c10083a8e68e..de0f9e5f9f73 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2600,8 +2600,7 @@ out_image:
if (bpf_jit_enable > 1)
bpf_jit_dump(prog->len, proglen, pass + 1, image);
- if (image) {
- bpf_jit_binary_lock_ro(header);
+ if (image && !bpf_jit_binary_lock_ro(header)) {
prog->bpf_func = (void *)image;
prog->jited = 1;
prog->jited_len = proglen;
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 87313701f069..f5dbd25651e0 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -35,12 +35,6 @@ struct sim_dev_reg {
struct sim_reg sim_reg;
};
-struct sim_reg_op {
- void (*init)(struct sim_dev_reg *reg);
- void (*read)(struct sim_dev_reg *reg, u32 value);
- void (*write)(struct sim_dev_reg *reg, u32 value);
-};
-
#define MB (1024 * 1024)
#define KB (1024)
#define SIZE_TO_MASK(size) (~(size - 1))
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index f347c20247d3..b33afb240601 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -907,6 +907,54 @@ static void chromeos_fixup_apl_pci_l1ss_capability(struct pci_dev *dev)
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_save_apl_pci_l1ss_capability);
DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_fixup_apl_pci_l1ss_capability);
+/*
+ * Disable D3cold on Asus B1400 PCI-NVMe bridge
+ *
+ * On this platform with VMD off, the NVMe device cannot successfully power
+ * back on from D3cold. This appears to be an untested transition by the
+ * vendor: Windows leaves the NVMe and parent bridge in D0 during suspend.
+ *
+ * We disable D3cold on the parent bridge for simplicity, and the fact that
+ * both parent bridge and NVMe device share the same power resource.
+ *
+ * This is only needed on BIOS versions before 308; the newer versions flip
+ * StorageD3Enable from 1 to 0.
+ */
+static const struct dmi_system_id asus_nvme_broken_d3cold_table[] = {
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BIOS_VERSION, "B1400CEAE.304"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BIOS_VERSION, "B1400CEAE.305"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BIOS_VERSION, "B1400CEAE.306"),
+ },
+ },
+ {
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_BIOS_VERSION, "B1400CEAE.307"),
+ },
+ },
+ {}
+};
+
+static void asus_disable_nvme_d3cold(struct pci_dev *pdev)
+{
+ if (dmi_check_system(asus_nvme_broken_d3cold_table) > 0)
+ pci_d3cold_disable(pdev);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x9a09, asus_disable_nvme_d3cold);
+
#ifdef CONFIG_SUSPEND
/*
* Root Ports on some AMD SoCs advertise PME_Support for D3hot and D3cold, but
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index f8ed5f66cd20..6b9c6deca8ba 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -7,6 +7,9 @@
* Copyright (c) 2015, Intel Corporation.
*/
+#define pr_fmt(fmt) "punit_atom: " fmt
+
+#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/device.h>
@@ -117,6 +120,51 @@ static void punit_dbgfs_unregister(void)
debugfs_remove_recursive(punit_dbg_file);
}
+#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
+static const struct punit_device *punit_dev;
+
+static void punit_s2idle_check(void)
+{
+ const struct punit_device *punit_devp;
+ u32 punit_pwr_status, dstate;
+ int status;
+
+ for (punit_devp = punit_dev; punit_devp->name; punit_devp++) {
+ /* Skip MIO, it is on till the very last moment */
+ if (punit_devp->reg == MIO_SS_PM)
+ continue;
+
+ status = iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_REG_READ,
+ punit_devp->reg, &punit_pwr_status);
+ if (status) {
+ pr_err("%s read failed\n", punit_devp->name);
+ } else {
+ dstate = (punit_pwr_status >> punit_devp->sss_pos) & 3;
+ if (!dstate)
+ pr_err("%s is in D0 prior to s2idle\n", punit_devp->name);
+ }
+ }
+}
+
+static struct acpi_s2idle_dev_ops punit_s2idle_ops = {
+ .check = punit_s2idle_check,
+};
+
+static void punit_s2idle_check_register(struct punit_device *punit_device)
+{
+ punit_dev = punit_device;
+ acpi_register_lps0_dev(&punit_s2idle_ops);
+}
+
+static void punit_s2idle_check_unregister(void)
+{
+ acpi_unregister_lps0_dev(&punit_s2idle_ops);
+}
+#else
+static void punit_s2idle_check_register(struct punit_device *punit_device) {}
+static void punit_s2idle_check_unregister(void) {}
+#endif
+
#define X86_MATCH(model, data) \
X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_##model, \
X86_FEATURE_MWAIT, data)
@@ -131,19 +179,23 @@ MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids);
static int __init punit_atom_debug_init(void)
{
+ struct punit_device *punit_device;
const struct x86_cpu_id *id;
id = x86_match_cpu(intel_punit_cpu_ids);
if (!id)
return -ENODEV;
- punit_dbgfs_register((struct punit_device *)id->driver_data);
+ punit_device = (struct punit_device *)id->driver_data;
+ punit_dbgfs_register(punit_device);
+ punit_s2idle_check_register(punit_device);
return 0;
}
static void __exit punit_atom_debug_exit(void)
{
+ punit_s2idle_check_unregister();
punit_dbgfs_unregister();
}
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index f32451bdcfdd..f8126821a94d 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -139,7 +139,6 @@ void __init x86_ce4100_early_setup(void)
x86_init.resources.probe_roms = x86_init_noop;
x86_init.mpparse.find_mptable = x86_init_noop;
x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
- x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
x86_init.pci.init = ce4100_pci_init;
x86_init.pci.init_irq = sdv_pci_init;
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index b42bfdab01a9..c5f3bbdbdcfe 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -62,11 +62,10 @@ static int iris_probe(struct platform_device *pdev)
return 0;
}
-static int iris_remove(struct platform_device *pdev)
+static void iris_remove(struct platform_device *pdev)
{
pm_power_off = old_pm_power_off;
printk(KERN_INFO "Iris power_off handler uninstalled.\n");
- return 0;
}
static struct platform_driver iris_driver = {
@@ -74,7 +73,7 @@ static struct platform_driver iris_driver = {
.name = "iris",
},
.probe = iris_probe,
- .remove = iris_remove,
+ .remove_new = iris_remove,
};
static struct resource iris_resources[] = {
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index f067ac780ba7..6a9c42de74e7 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -144,7 +144,7 @@ static int xo1_pm_probe(struct platform_device *pdev)
return 0;
}
-static int xo1_pm_remove(struct platform_device *pdev)
+static void xo1_pm_remove(struct platform_device *pdev)
{
if (strcmp(pdev->name, "cs5535-pms") == 0)
pms_base = 0;
@@ -152,7 +152,6 @@ static int xo1_pm_remove(struct platform_device *pdev)
acpi_base = 0;
pm_power_off = NULL;
- return 0;
}
static struct platform_driver cs5535_pms_driver = {
@@ -160,7 +159,7 @@ static struct platform_driver cs5535_pms_driver = {
.name = "cs5535-pms",
},
.probe = xo1_pm_probe,
- .remove = xo1_pm_remove,
+ .remove_new = xo1_pm_remove,
};
static struct platform_driver cs5535_acpi_driver = {
@@ -168,7 +167,7 @@ static struct platform_driver cs5535_acpi_driver = {
.name = "olpc-xo1-pm-acpi",
},
.probe = xo1_pm_probe,
- .remove = xo1_pm_remove,
+ .remove_new = xo1_pm_remove,
};
static int __init xo1_pm_init(void)
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 89f25af4b3c3..46d42ff6e18a 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -598,7 +598,7 @@ err_ebook:
return r;
}
-static int xo1_sci_remove(struct platform_device *pdev)
+static void xo1_sci_remove(struct platform_device *pdev)
{
free_irq(sci_irq, pdev);
cancel_work_sync(&sci_work);
@@ -608,7 +608,6 @@ static int xo1_sci_remove(struct platform_device *pdev)
free_ebook_switch();
free_power_button();
acpi_base = 0;
- return 0;
}
static struct platform_driver xo1_sci_driver = {
@@ -617,7 +616,7 @@ static struct platform_driver xo1_sci_driver = {
.dev_groups = lid_groups,
},
.probe = xo1_sci_probe,
- .remove = xo1_sci_remove,
+ .remove_new = xo1_sci_remove,
.suspend = xo1_sci_suspend,
.resume = xo1_sci_resume,
};
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 944e0290f2c0..8c2d4b8de25d 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -75,6 +75,9 @@ static void __init init_pvh_bootparams(bool xen_guest)
} else
xen_raw_printk("Warning: Can fit ISA range into e820\n");
+ if (xen_guest)
+ xen_reserve_extra_memory(&pvh_bootparams);
+
pvh_bootparams.hdr.cmd_line_ptr =
pvh_start_info.cmdline_paddr;
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 379777572bc9..e0cd7afd5302 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -5,7 +5,7 @@
CFLAGS_cpu.o := -fno-stack-protector
# Clang may incorrectly inline functions with stack protector enabled into
-# __restore_processor_state(): https://bugs.llvm.org/show_bug.cgi?id=47479
+# __restore_processor_state(): https://llvm.org/pr47479
CFLAGS_REMOVE_cpu.o := $(CC_FLAGS_LTO)
obj-$(CONFIG_PM_SLEEP) += cpu.o
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index 6f955eb1e163..5b81d19cd114 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -165,17 +165,17 @@ int relocate_restore_code(void)
pgd = (pgd_t *)__va(read_cr3_pa()) +
pgd_index(relocated_restore_code);
p4d = p4d_offset(pgd, relocated_restore_code);
- if (p4d_large(*p4d)) {
+ if (p4d_leaf(*p4d)) {
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
goto out;
}
pud = pud_offset(p4d, relocated_restore_code);
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
goto out;
}
pmd = pmd_offset(pud, relocated_restore_code);
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
goto out;
}
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index bc31863c5ee6..a18591f6e6d9 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -42,7 +42,8 @@ KCOV_INSTRUMENT := n
# make up the standalone purgatory.ro
PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
-PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss -g0
+PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0
+PURGATORY_CFLAGS += -fpic -fvisibility=hidden
PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
PURGATORY_CFLAGS += -fno-stack-protector
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index b029fb81ebee..c101bed61940 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -11,41 +11,42 @@
#define Elf_Shdr ElfW(Shdr)
#define Elf_Sym ElfW(Sym)
-static Elf_Ehdr ehdr;
-static unsigned long shnum;
-static unsigned int shstrndx;
-static unsigned int shsymtabndx;
-static unsigned int shxsymtabndx;
+static Elf_Ehdr ehdr;
+static unsigned long shnum;
+static unsigned int shstrndx;
+static unsigned int shsymtabndx;
+static unsigned int shxsymtabndx;
static int sym_index(Elf_Sym *sym);
struct relocs {
- uint32_t *offset;
- unsigned long count;
- unsigned long size;
+ uint32_t *offset;
+ unsigned long count;
+ unsigned long size;
};
-static struct relocs relocs16;
-static struct relocs relocs32;
+static struct relocs relocs16;
+static struct relocs relocs32;
+
#if ELF_BITS == 64
-static struct relocs relocs32neg;
-static struct relocs relocs64;
-#define FMT PRIu64
+static struct relocs relocs32neg;
+static struct relocs relocs64;
+# define FMT PRIu64
#else
-#define FMT PRIu32
+# define FMT PRIu32
#endif
struct section {
- Elf_Shdr shdr;
- struct section *link;
- Elf_Sym *symtab;
- Elf32_Word *xsymtab;
- Elf_Rel *reltab;
- char *strtab;
+ Elf_Shdr shdr;
+ struct section *link;
+ Elf_Sym *symtab;
+ Elf32_Word *xsymtab;
+ Elf_Rel *reltab;
+ char *strtab;
};
-static struct section *secs;
+static struct section *secs;
-static const char * const sym_regex_kernel[S_NSYMTYPES] = {
+static const char * const sym_regex_kernel[S_NSYMTYPES] = {
/*
* Following symbols have been audited. There values are constant and do
* not change if bzImage is loaded at a different physical address than
@@ -115,13 +116,13 @@ static const char * const sym_regex_realmode[S_NSYMTYPES] = {
"^pa_",
};
-static const char * const *sym_regex;
+static const char * const *sym_regex;
+
+static regex_t sym_regex_c[S_NSYMTYPES];
-static regex_t sym_regex_c[S_NSYMTYPES];
static int is_reloc(enum symtype type, const char *sym_name)
{
- return sym_regex[type] &&
- !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
+ return sym_regex[type] && !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0);
}
static void regex_init(int use_real_mode)
@@ -139,8 +140,7 @@ static void regex_init(int use_real_mode)
if (!sym_regex[i])
continue;
- err = regcomp(&sym_regex_c[i], sym_regex[i],
- REG_EXTENDED|REG_NOSUB);
+ err = regcomp(&sym_regex_c[i], sym_regex[i], REG_EXTENDED|REG_NOSUB);
if (err) {
regerror(err, &sym_regex_c[i], errbuf, sizeof(errbuf));
@@ -163,9 +163,10 @@ static const char *sym_type(unsigned type)
#undef SYM_TYPE
};
const char *name = "unknown sym type name";
- if (type < ARRAY_SIZE(type_name)) {
+
+ if (type < ARRAY_SIZE(type_name))
name = type_name[type];
- }
+
return name;
}
@@ -179,9 +180,10 @@ static const char *sym_bind(unsigned bind)
#undef SYM_BIND
};
const char *name = "unknown sym bind name";
- if (bind < ARRAY_SIZE(bind_name)) {
+
+ if (bind < ARRAY_SIZE(bind_name))
name = bind_name[bind];
- }
+
return name;
}
@@ -196,9 +198,10 @@ static const char *sym_visibility(unsigned visibility)
#undef SYM_VISIBILITY
};
const char *name = "unknown sym visibility name";
- if (visibility < ARRAY_SIZE(visibility_name)) {
+
+ if (visibility < ARRAY_SIZE(visibility_name))
name = visibility_name[visibility];
- }
+
return name;
}
@@ -244,9 +247,10 @@ static const char *rel_type(unsigned type)
#undef REL_TYPE
};
const char *name = "unknown type rel type name";
- if (type < ARRAY_SIZE(type_name) && type_name[type]) {
+
+ if (type < ARRAY_SIZE(type_name) && type_name[type])
name = type_name[type];
- }
+
return name;
}
@@ -256,15 +260,14 @@ static const char *sec_name(unsigned shndx)
const char *name;
sec_strtab = secs[shstrndx].strtab;
name = "<noname>";
- if (shndx < shnum) {
+
+ if (shndx < shnum)
name = sec_strtab + secs[shndx].shdr.sh_name;
- }
- else if (shndx == SHN_ABS) {
+ else if (shndx == SHN_ABS)
name = "ABSOLUTE";
- }
- else if (shndx == SHN_COMMON) {
+ else if (shndx == SHN_COMMON)
name = "COMMON";
- }
+
return name;
}
@@ -272,18 +275,19 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym)
{
const char *name;
name = "<noname>";
- if (sym->st_name) {
+
+ if (sym->st_name)
name = sym_strtab + sym->st_name;
- }
- else {
+ else
name = sec_name(sym_index(sym));
- }
+
return name;
}
static Elf_Sym *sym_lookup(const char *symname)
{
int i;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
long nsyms;
@@ -309,14 +313,15 @@ static Elf_Sym *sym_lookup(const char *symname)
}
#if BYTE_ORDER == LITTLE_ENDIAN
-#define le16_to_cpu(val) (val)
-#define le32_to_cpu(val) (val)
-#define le64_to_cpu(val) (val)
+# define le16_to_cpu(val) (val)
+# define le32_to_cpu(val) (val)
+# define le64_to_cpu(val) (val)
#endif
+
#if BYTE_ORDER == BIG_ENDIAN
-#define le16_to_cpu(val) bswap_16(val)
-#define le32_to_cpu(val) bswap_32(val)
-#define le64_to_cpu(val) bswap_64(val)
+# define le16_to_cpu(val) bswap_16(val)
+# define le32_to_cpu(val) bswap_32(val)
+# define le64_to_cpu(val) bswap_64(val)
#endif
static uint16_t elf16_to_cpu(uint16_t val)
@@ -337,13 +342,13 @@ static uint64_t elf64_to_cpu(uint64_t val)
{
return le64_to_cpu(val);
}
-#define elf_addr_to_cpu(x) elf64_to_cpu(x)
-#define elf_off_to_cpu(x) elf64_to_cpu(x)
-#define elf_xword_to_cpu(x) elf64_to_cpu(x)
+# define elf_addr_to_cpu(x) elf64_to_cpu(x)
+# define elf_off_to_cpu(x) elf64_to_cpu(x)
+# define elf_xword_to_cpu(x) elf64_to_cpu(x)
#else
-#define elf_addr_to_cpu(x) elf32_to_cpu(x)
-#define elf_off_to_cpu(x) elf32_to_cpu(x)
-#define elf_xword_to_cpu(x) elf32_to_cpu(x)
+# define elf_addr_to_cpu(x) elf32_to_cpu(x)
+# define elf_off_to_cpu(x) elf32_to_cpu(x)
+# define elf_xword_to_cpu(x) elf32_to_cpu(x)
#endif
static int sym_index(Elf_Sym *sym)
@@ -365,22 +370,17 @@ static int sym_index(Elf_Sym *sym)
static void read_ehdr(FILE *fp)
{
- if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) {
- die("Cannot read ELF header: %s\n",
- strerror(errno));
- }
- if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) {
+ if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1)
+ die("Cannot read ELF header: %s\n", strerror(errno));
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0)
die("No ELF magic\n");
- }
- if (ehdr.e_ident[EI_CLASS] != ELF_CLASS) {
+ if (ehdr.e_ident[EI_CLASS] != ELF_CLASS)
die("Not a %d bit executable\n", ELF_BITS);
- }
- if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) {
+ if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB)
die("Not a LSB ELF executable\n");
- }
- if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) {
+ if (ehdr.e_ident[EI_VERSION] != EV_CURRENT)
die("Unknown ELF version\n");
- }
+
/* Convert the fields to native endian */
ehdr.e_type = elf_half_to_cpu(ehdr.e_type);
ehdr.e_machine = elf_half_to_cpu(ehdr.e_machine);
@@ -439,19 +439,18 @@ static void read_shdrs(FILE *fp)
Elf_Shdr shdr;
secs = calloc(shnum, sizeof(struct section));
- if (!secs) {
- die("Unable to allocate %ld section headers\n",
- shnum);
- }
- if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- ehdr.e_shoff, strerror(errno));
- }
+ if (!secs)
+ die("Unable to allocate %ld section headers\n", shnum);
+
+ if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno));
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
+
if (fread(&shdr, sizeof(shdr), 1, fp) != 1)
- die("Cannot read ELF section headers %d/%ld: %s\n",
- i, shnum, strerror(errno));
+ die("Cannot read ELF section headers %d/%ld: %s\n", i, shnum, strerror(errno));
+
sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name);
sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type);
sec->shdr.sh_flags = elf_xword_to_cpu(shdr.sh_flags);
@@ -471,31 +470,28 @@ static void read_shdrs(FILE *fp)
static void read_strtabs(FILE *fp)
{
int i;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_STRTAB) {
+
+ if (sec->shdr.sh_type != SHT_STRTAB)
continue;
- }
+
sec->strtab = malloc(sec->shdr.sh_size);
- if (!sec->strtab) {
- die("malloc of %" FMT " bytes for strtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->strtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->strtab)
+ die("malloc of %" FMT " bytes for strtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
}
}
static void read_symtabs(FILE *fp)
{
- int i,j;
+ int i, j;
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
@@ -504,19 +500,15 @@ static void read_symtabs(FILE *fp)
switch (sec->shdr.sh_type) {
case SHT_SYMTAB_SHNDX:
sec->xsymtab = malloc(sec->shdr.sh_size);
- if (!sec->xsymtab) {
- die("malloc of %" FMT " bytes for xsymtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read extended symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->xsymtab)
+ die("malloc of %" FMT " bytes for xsymtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->xsymtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read extended symbol table: %s\n", strerror(errno));
+
shxsymtabndx = i;
continue;
@@ -524,19 +516,15 @@ static void read_symtabs(FILE *fp)
num_syms = sec->shdr.sh_size / sizeof(Elf_Sym);
sec->symtab = malloc(sec->shdr.sh_size);
- if (!sec->symtab) {
- die("malloc of %" FMT " bytes for symtab failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->symtab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->symtab)
+ die("malloc of %" FMT " bytes for symtab failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
+
for (j = 0; j < num_syms; j++) {
Elf_Sym *sym = &sec->symtab[j];
@@ -557,28 +545,27 @@ static void read_symtabs(FILE *fp)
static void read_relocs(FILE *fp)
{
- int i,j;
+ int i, j;
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec->reltab = malloc(sec->shdr.sh_size);
- if (!sec->reltab) {
- die("malloc of %" FMT " bytes for relocs failed\n",
- sec->shdr.sh_size);
- }
- if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
- die("Seek to %" FMT " failed: %s\n",
- sec->shdr.sh_offset, strerror(errno));
- }
- if (fread(sec->reltab, 1, sec->shdr.sh_size, fp)
- != sec->shdr.sh_size) {
- die("Cannot read symbol table: %s\n",
- strerror(errno));
- }
+ if (!sec->reltab)
+ die("malloc of %" FMT " bytes for relocs failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel = &sec->reltab[j];
+
rel->r_offset = elf_addr_to_cpu(rel->r_offset);
rel->r_info = elf_xword_to_cpu(rel->r_info);
#if (SHT_REL_TYPE == SHT_RELA)
@@ -601,23 +588,27 @@ static void print_absolute_symbols(void)
printf("Absolute symbols\n");
printf(" Num: Value Size Type Bind Visibility Name\n");
+
for (i = 0; i < shnum; i++) {
struct section *sec = &secs[i];
char *sym_strtab;
int j;
- if (sec->shdr.sh_type != SHT_SYMTAB) {
+ if (sec->shdr.sh_type != SHT_SYMTAB)
continue;
- }
+
sym_strtab = sec->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Sym); j++) {
Elf_Sym *sym;
const char *name;
+
sym = &sec->symtab[j];
name = sym_name(sym_strtab, sym);
- if (sym->st_shndx != SHN_ABS) {
+
+ if (sym->st_shndx != SHN_ABS)
continue;
- }
+
printf(format,
j, sym->st_value, sym->st_size,
sym_type(ELF_ST_TYPE(sym->st_info)),
@@ -645,34 +636,37 @@ static void print_absolute_relocs(void)
char *sym_strtab;
Elf_Sym *sh_symtab;
int j;
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec_symtab = sec->link;
sec_applies = &secs[sec->shdr.sh_info];
- if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC))
continue;
- }
+
/*
* Do not perform relocations in .notes section; any
* values there are meant for pre-boot consumption (e.g.
* startup_xen).
*/
- if (sec_applies->shdr.sh_type == SHT_NOTE) {
+ if (sec_applies->shdr.sh_type == SHT_NOTE)
continue;
- }
+
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel;
Elf_Sym *sym;
const char *name;
+
rel = &sec->reltab[j];
sym = &sh_symtab[ELF_R_SYM(rel->r_info)];
name = sym_name(sym_strtab, sym);
- if (sym->st_shndx != SHN_ABS) {
+
+ if (sym->st_shndx != SHN_ABS)
continue;
- }
/* Absolute symbols are not relocated if bzImage is
* loaded at a non-compiled address. Display a warning
@@ -691,10 +685,8 @@ static void print_absolute_relocs(void)
continue;
if (!printed) {
- printf("WARNING: Absolute relocations"
- " present\n");
- printf("Offset Info Type Sym.Value "
- "Sym.Name\n");
+ printf("WARNING: Absolute relocations present\n");
+ printf("Offset Info Type Sym.Value Sym.Name\n");
printed = 1;
}
@@ -718,8 +710,8 @@ static void add_reloc(struct relocs *r, uint32_t offset)
void *mem = realloc(r->offset, newsize * sizeof(r->offset[0]));
if (!mem)
- die("realloc of %ld entries for relocs failed\n",
- newsize);
+ die("realloc of %ld entries for relocs failed\n", newsize);
+
r->offset = mem;
r->size = newsize;
}
@@ -730,6 +722,7 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
Elf_Sym *sym, const char *symname))
{
int i;
+
/* Walk through the relocations */
for (i = 0; i < shnum; i++) {
char *sym_strtab;
@@ -738,16 +731,25 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
int j;
struct section *sec = &secs[i];
- if (sec->shdr.sh_type != SHT_REL_TYPE) {
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
continue;
- }
+
sec_symtab = sec->link;
sec_applies = &secs[sec->shdr.sh_info];
- if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) {
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC))
continue;
- }
+
+ /*
+ * Do not perform relocations in .notes sections; any
+ * values there are meant for pre-boot consumption (e.g.
+ * startup_xen).
+ */
+ if (sec_applies->shdr.sh_type == SHT_NOTE)
+ continue;
+
sh_symtab = sec_symtab->symtab;
sym_strtab = sec_symtab->link->strtab;
+
for (j = 0; j < sec->shdr.sh_size/sizeof(Elf_Rel); j++) {
Elf_Rel *rel = &sec->reltab[j];
Elf_Sym *sym = &sh_symtab[ELF_R_SYM(rel->r_info)];
@@ -781,14 +783,16 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
* kernel data and does not require special treatment.
*
*/
-static int per_cpu_shndx = -1;
+static int per_cpu_shndx = -1;
static Elf_Addr per_cpu_load_addr;
static void percpu_init(void)
{
int i;
+
for (i = 0; i < shnum; i++) {
ElfW(Sym) *sym;
+
if (strcmp(sec_name(i), ".data..percpu"))
continue;
@@ -801,6 +805,7 @@ static void percpu_init(void)
per_cpu_shndx = i;
per_cpu_load_addr = sym->st_value;
+
return;
}
}
@@ -871,8 +876,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
* Only used by jump labels
*/
if (is_percpu_sym(sym, symname))
- die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n",
- symname);
+ die("Invalid R_X86_64_PC64 relocation against per-CPU symbol %s\n", symname);
break;
case R_X86_64_32:
@@ -892,8 +896,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
if (is_reloc(S_ABS, symname))
break;
- die("Invalid absolute %s relocation: %s\n",
- rel_type(r_type), symname);
+ die("Invalid absolute %s relocation: %s\n", rel_type(r_type), symname);
break;
}
@@ -913,8 +916,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
@@ -951,8 +953,7 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
if (is_reloc(S_ABS, symname))
break;
- die("Invalid absolute %s relocation: %s\n",
- rel_type(r_type), symname);
+ die("Invalid absolute %s relocation: %s\n", rel_type(r_type), symname);
break;
}
@@ -960,16 +961,14 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
return 0;
}
-static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
- const char *symname)
+static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname)
{
unsigned r_type = ELF32_R_TYPE(rel->r_info);
int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname);
@@ -1004,9 +1003,7 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
if (!is_reloc(S_LIN, symname))
break;
}
- die("Invalid %s %s relocation: %s\n",
- shn_abs ? "absolute" : "relative",
- rel_type(r_type), symname);
+ die("Invalid %s %s relocation: %s\n", shn_abs ? "absolute" : "relative", rel_type(r_type), symname);
break;
case R_386_32:
@@ -1027,14 +1024,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
add_reloc(&relocs32, rel->r_offset);
break;
}
- die("Invalid %s %s relocation: %s\n",
- shn_abs ? "absolute" : "relative",
- rel_type(r_type), symname);
+ die("Invalid %s %s relocation: %s\n", shn_abs ? "absolute" : "relative", rel_type(r_type), symname);
break;
default:
- die("Unsupported relocation type: %s (%d)\n",
- rel_type(r_type), r_type);
+ die("Unsupported relocation type: %s (%d)\n", rel_type(r_type), r_type);
break;
}
@@ -1046,7 +1040,10 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
static int cmp_relocs(const void *va, const void *vb)
{
const uint32_t *a, *b;
- a = va; b = vb;
+
+ a = va;
+ b = vb;
+
return (*a == *b)? 0 : (*a > *b)? 1 : -1;
}
@@ -1060,6 +1057,7 @@ static int write32(uint32_t v, FILE *f)
unsigned char buf[4];
put_unaligned_le32(v, buf);
+
return fwrite(buf, 1, 4, f) == 4 ? 0 : -1;
}
@@ -1072,8 +1070,7 @@ static void emit_relocs(int as_text, int use_real_mode)
{
int i;
int (*write_reloc)(uint32_t, FILE *) = write32;
- int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
- const char *symname);
+ int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname);
#if ELF_BITS == 64
if (!use_real_mode)
@@ -1160,6 +1157,7 @@ static int do_reloc_info(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
rel_type(ELF_R_TYPE(rel->r_info)),
symname,
sec_name(sym_index(sym)));
+
return 0;
}
@@ -1185,19 +1183,24 @@ void process(FILE *fp, int use_real_mode, int as_text,
read_strtabs(fp);
read_symtabs(fp);
read_relocs(fp);
+
if (ELF_BITS == 64)
percpu_init();
+
if (show_absolute_syms) {
print_absolute_symbols();
return;
}
+
if (show_absolute_relocs) {
print_absolute_relocs();
return;
}
+
if (show_reloc_info) {
print_reloc_info();
return;
}
+
emit_relocs(as_text, use_real_mode);
}
diff --git a/arch/x86/virt/Makefile b/arch/x86/virt/Makefile
index 1e36502cd738..ea343fc392dc 100644
--- a/arch/x86/virt/Makefile
+++ b/arch/x86/virt/Makefile
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += vmx/
+obj-y += svm/ vmx/
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index cffe1157a90a..0ae10535c699 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -77,7 +77,7 @@ static int __mfd_enable(unsigned int cpu)
{
u64 val;
- if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return 0;
rdmsrl(MSR_AMD64_SYSCFG, val);
@@ -98,7 +98,7 @@ static int __snp_enable(unsigned int cpu)
{
u64 val;
- if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return 0;
rdmsrl(MSR_AMD64_SYSCFG, val);
@@ -163,6 +163,42 @@ bool snp_probe_rmptable_info(void)
return true;
}
+static void __init __snp_fixup_e820_tables(u64 pa)
+{
+ if (IS_ALIGNED(pa, PMD_SIZE))
+ return;
+
+ /*
+ * Handle cases where the RMP table placement by the BIOS is not
+ * 2M aligned and the kexec kernel could try to allocate
+ * from within that chunk which then causes a fatal RMP fault.
+ *
+ * The e820_table needs to be updated as it is converted to
+ * kernel memory resources and used by KEXEC_FILE_LOAD syscall
+ * to load kexec segments.
+ *
+ * The e820_table_firmware needs to be updated as it is exposed
+ * to sysfs and used by the KEXEC_LOAD syscall to load kexec
+ * segments.
+ *
+ * The e820_table_kexec needs to be updated as it passed to
+ * the kexec-ed kernel.
+ */
+ pa = ALIGN_DOWN(pa, PMD_SIZE);
+ if (e820__mapped_any(pa, pa + PMD_SIZE, E820_TYPE_RAM)) {
+ pr_info("Reserving start/end of RMP table on a 2MB boundary [0x%016llx]\n", pa);
+ e820__range_update(pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ e820__range_update_table(e820_table_kexec, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ e820__range_update_table(e820_table_firmware, pa, PMD_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ }
+}
+
+void __init snp_fixup_e820_tables(void)
+{
+ __snp_fixup_e820_tables(probed_rmp_base);
+ __snp_fixup_e820_tables(probed_rmp_base + probed_rmp_size);
+}
+
/*
* Do the necessary preparations which are verified by the firmware as
* described in the SNP_INIT_EX firmware command description in the SNP
@@ -174,11 +210,11 @@ static int __init snp_rmptable_init(void)
u64 rmptable_size;
u64 val;
- if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return 0;
if (!amd_iommu_snp_en)
- return 0;
+ goto nosnp;
if (!probed_rmp_size)
goto nosnp;
@@ -225,7 +261,7 @@ skip_enable:
return 0;
nosnp:
- setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
return -ENOSYS;
}
@@ -246,7 +282,7 @@ static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
{
struct rmpentry *large_entry, *entry;
- if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return ERR_PTR(-ENODEV);
entry = get_rmpentry(pfn);
@@ -363,7 +399,7 @@ int psmash(u64 pfn)
unsigned long paddr = pfn << PAGE_SHIFT;
int ret;
- if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return -ENODEV;
if (!pfn_valid(pfn))
@@ -472,7 +508,7 @@ static int rmpupdate(u64 pfn, struct rmp_state *state)
unsigned long paddr = pfn << PAGE_SHIFT;
int ret, level;
- if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
return -ENODEV;
level = RMP_TO_PG_LEVEL(state->pagesize);
@@ -558,3 +594,13 @@ void snp_leak_pages(u64 pfn, unsigned int npages)
spin_unlock(&snp_leaked_pages_list_lock);
}
EXPORT_SYMBOL_GPL(snp_leak_pages);
+
+void kdump_sev_callback(void)
+{
+ /*
+ * Do wbinvd() on remote CPUs when SNP is enabled in order to
+ * safely do SNP_SHUTDOWN on the local CPU.
+ */
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ wbinvd();
+}
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 4d6826a76f78..49a1c6890b55 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -27,7 +27,6 @@
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
-#include <linux/acpi.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index a65fc2ae15b4..77e788e928cd 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -81,7 +81,6 @@ config XEN_PVH
bool "Xen PVH guest support"
depends on XEN && XEN_PVHVM && ACPI
select PVH
- def_bool n
help
Support for running as a Xen PVH guest.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3c61bb98c10e..5f3a69f6ec34 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1,11 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-#include <linux/memblock.h>
-#endif
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/kexec.h>
+#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/panic_notifier.h>
@@ -350,3 +348,34 @@ void xen_arch_unregister_cpu(int num)
}
EXPORT_SYMBOL(xen_arch_unregister_cpu);
#endif
+
+/* Amount of extra memory space we add to the e820 ranges */
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
+{
+ unsigned int i;
+
+ /*
+ * No need to check for zero size, should happen rarely and will only
+ * write a new entry regarded to be unused due to zero size.
+ */
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ /* Add new region. */
+ if (xen_extra_mem[i].n_pfns == 0) {
+ xen_extra_mem[i].start_pfn = start_pfn;
+ xen_extra_mem[i].n_pfns = n_pfns;
+ break;
+ }
+ /* Append to existing region. */
+ if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+ start_pfn) {
+ xen_extra_mem[i].n_pfns += n_pfns;
+ break;
+ }
+ }
+ if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+ printk(KERN_WARNING "Warning: not enough extra memory regions\n");
+
+ memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+}
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 99a68fa71dbe..c001a2296582 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -148,7 +148,9 @@ static void xen_hvm_shutdown(void)
if (kexec_in_progress)
xen_reboot(SHUTDOWN_soft_reset);
}
+#endif
+#ifdef CONFIG_CRASH_DUMP
static void xen_hvm_crash_shutdown(struct pt_regs *regs)
{
native_machine_crash_shutdown(regs);
@@ -236,6 +238,8 @@ static void __init xen_hvm_guest_init(void)
#ifdef CONFIG_KEXEC_CORE
machine_ops.shutdown = xen_hvm_shutdown;
+#endif
+#ifdef CONFIG_CRASH_DUMP
machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
#endif
}
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ace2eb054053..9ba53814ed6a 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -219,13 +219,21 @@ static __read_mostly unsigned int cpuid_leaf5_edx_val;
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
- unsigned maskebx = ~0;
+ unsigned int maskebx = ~0;
+ unsigned int or_ebx = 0;
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
switch (*ax) {
+ case 0x1:
+ /* Replace initial APIC ID in bits 24-31 of EBX. */
+ /* See xen_pv_smp_config() for related topology preparations. */
+ maskebx = 0x00ffffff;
+ or_ebx = smp_processor_id() << 24;
+ break;
+
case CPUID_MWAIT_LEAF:
/* Synthesize the values.. */
*ax = 0;
@@ -248,6 +256,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
: "0" (*ax), "2" (*cx));
*bx &= maskebx;
+ *bx |= or_ebx;
}
static bool __init xen_check_mwait(void)
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index 9e9db601bd52..27a2a02ef8fb 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/acpi.h>
#include <linux/export.h>
+#include <linux/mm.h>
#include <xen/hvc-console.h>
@@ -73,3 +74,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p)
}
boot_params_p->e820_entries = memmap.nr_entries;
}
+
+/*
+ * Reserve e820 UNUSABLE regions to inflate the memory balloon.
+ *
+ * On PVH dom0 the host memory map is used, RAM regions available to dom0 are
+ * located as the same place as in the native memory map, but since dom0 gets
+ * less memory than the total amount of host RAM the ranges that can't be
+ * populated are converted from RAM -> UNUSABLE. Use such regions (up to the
+ * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at
+ * boot. Doing so prevents the guest (even if just temporary) from using holes
+ * in the memory map in order to map grants or foreign addresses, and
+ * hopefully limits the risk of a clash with a device MMIO region. Ideally the
+ * hypervisor should notify us which memory ranges are suitable for creating
+ * foreign mappings, but that's not yet implemented.
+ */
+void __init xen_reserve_extra_memory(struct boot_params *bootp)
+{
+ unsigned int i, ram_pages = 0, extra_pages;
+
+ for (i = 0; i < bootp->e820_entries; i++) {
+ struct boot_e820_entry *e = &bootp->e820_table[i];
+
+ if (e->type != E820_TYPE_RAM)
+ continue;
+ ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr);
+ }
+
+ /* Max amount of extra memory. */
+ extra_pages = EXTRA_MEM_RATIO * ram_pages;
+
+ /*
+ * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping
+ * purposes.
+ */
+ for (i = 0; i < bootp->e820_entries && extra_pages; i++) {
+ struct boot_e820_entry *e = &bootp->e820_table[i];
+ unsigned long pages;
+
+ if (e->type != E820_TYPE_UNUSABLE)
+ continue;
+
+ pages = min(extra_pages,
+ PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr));
+
+ if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) {
+ struct boot_e820_entry *next;
+
+ if (bootp->e820_entries ==
+ ARRAY_SIZE(bootp->e820_table))
+ /* No space left to split - skip region. */
+ continue;
+
+ /* Split entry. */
+ next = e + 1;
+ memmove(next, e,
+ (bootp->e820_entries - i) * sizeof(*e));
+ bootp->e820_entries++;
+ next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages);
+ e->size = next->addr - e->addr;
+ next->size -= e->size;
+ }
+ e->type = E820_TYPE_RAM;
+ extra_pages -= pages;
+
+ xen_add_extra_mem(PFN_UP(e->addr), pages);
+ }
+}
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 72af496a160c..54e0d311dcc9 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -913,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info)
struct mm_struct *mm = info;
if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
- leave_mm(smp_processor_id());
+ leave_mm();
/*
* If this cpu still has a stale cr3 reference, then make sure
@@ -1059,7 +1059,7 @@ static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
pte_t *pte_tbl;
int i;
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PMD_SIZE);
return;
@@ -1082,7 +1082,7 @@ static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
pmd_t *pmd_tbl;
int i;
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PUD_SIZE);
return;
@@ -1104,7 +1104,7 @@ static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
pud_t *pud_tbl;
int i;
- if (p4d_large(*p4d)) {
+ if (p4d_leaf(*p4d)) {
pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, P4D_SIZE);
return;
@@ -1863,7 +1863,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
if (!pud_present(pud))
return 0;
pa = pud_val(pud) & PTE_PFN_MASK;
- if (pud_large(pud))
+ if (pud_leaf(pud))
return pa + (vaddr & ~PUD_MASK);
pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
@@ -1871,7 +1871,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
if (!pmd_present(pmd))
return 0;
pa = pmd_val(pmd) & PTE_PFN_MASK;
- if (pmd_large(pmd))
+ if (pmd_leaf(pmd))
return pa + (vaddr & ~PMD_MASK);
pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
@@ -2520,7 +2520,7 @@ out:
}
EXPORT_SYMBOL_GPL(xen_remap_pfn);
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_VMCORE_INFO
phys_addr_t paddr_vmcoreinfo_note(void)
{
if (xen_pv_domain())
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b3e37961065a..380591028cb8 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -38,9 +38,6 @@
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
-/* Amount of extra memory space we add to the e820 ranges */
-struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
-
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
@@ -64,18 +61,6 @@ static struct {
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
-/*
- * The maximum amount of extra memory compared to the base size. The
- * main scaling factor is the size of struct page. At extreme ratios
- * of base:extra, all the base memory can be filled with page
- * structures for the extra memory, leaving no space for anything
- * else.
- *
- * 10x seems like a reasonable balance between scaling flexibility and
- * leaving a practically usable system.
- */
-#define EXTRA_MEM_RATIO (10)
-
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
@@ -96,35 +81,6 @@ static void __init xen_parse_512gb(void)
xen_512gb_limit = val;
}
-static void __init xen_add_extra_mem(unsigned long start_pfn,
- unsigned long n_pfns)
-{
- int i;
-
- /*
- * No need to check for zero size, should happen rarely and will only
- * write a new entry regarded to be unused due to zero size.
- */
- for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- /* Add new region. */
- if (xen_extra_mem[i].n_pfns == 0) {
- xen_extra_mem[i].start_pfn = start_pfn;
- xen_extra_mem[i].n_pfns = n_pfns;
- break;
- }
- /* Append to existing region. */
- if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
- start_pfn) {
- xen_extra_mem[i].n_pfns += n_pfns;
- break;
- }
- }
- if (i == XEN_EXTRA_MEM_MAX_REGIONS)
- printk(KERN_WARNING "Warning: not enough extra memory regions\n");
-
- memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
-}
-
static void __init xen_del_extra_mem(unsigned long start_pfn,
unsigned long n_pfns)
{
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 27d1a5b7f571..ac41d83b38d3 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -154,9 +154,9 @@ static void __init xen_pv_smp_config(void)
u32 apicid = 0;
int i;
- topology_register_boot_apic(apicid++);
+ topology_register_boot_apic(apicid);
- for (i = 1; i < nr_cpu_ids; i++)
+ for (i = 0; i < nr_cpu_ids; i++)
topology_register_apic(apicid++, CPU_ACPIID_INVALID, true);
/* Pretend to be a proper enumerated system */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 04101b984f24..758bcd47b72d 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -49,7 +49,7 @@ SYM_CODE_START(startup_xen)
ANNOTATE_NOENDBR
cld
- leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
+ leaq __top_init_kernel_stack(%rip), %rsp
/* Set up %gs.
*
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index a87ab36889e7..79cf93f2c92f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -163,4 +163,18 @@ void xen_hvm_post_suspend(int suspend_cancelled);
static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
#endif
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns);
+
#endif /* XEN_OPS_H */