summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig77
-rw-r--r--arch/x86/Kconfig.debug2
-rw-r--r--arch/x86/Makefile1
-rw-r--r--arch/x86/boot/boot.h73
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/boot/compressed/acpi.c176
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c3
-rw-r--r--arch/x86/boot/compressed/efi.c234
-rw-r--r--arch/x86/boot/compressed/efi.h126
-rw-r--r--arch/x86/boot/compressed/head_64.S64
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c39
-rw-r--r--arch/x86/boot/compressed/idt_64.c18
-rw-r--r--arch/x86/boot/compressed/kaslr.c3
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S36
-rw-r--r--arch/x86/boot/compressed/misc.c17
-rw-r--r--arch/x86/boot/compressed/misc.h60
-rw-r--r--arch/x86/boot/compressed/pgtable.h2
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c3
-rw-r--r--arch/x86/boot/compressed/sev.c263
-rw-r--r--arch/x86/boot/compressed/tdcall.S3
-rw-r--r--arch/x86/boot/compressed/tdx.c77
-rw-r--r--arch/x86/boot/compressed/tdx.h13
-rw-r--r--arch/x86/boot/cpucheck.c30
-rw-r--r--arch/x86/boot/cpuflags.c3
-rw-r--r--arch/x86/boot/cpuflags.h1
-rw-r--r--arch/x86/boot/header.S4
-rw-r--r--arch/x86/boot/io.h41
-rw-r--r--arch/x86/boot/main.c6
-rw-r--r--arch/x86/boot/msr.h26
-rw-r--r--arch/x86/coco/Makefile2
-rw-r--r--arch/x86/coco/core.c25
-rw-r--r--arch/x86/coco/tdx/Makefile3
-rw-r--r--arch/x86/coco/tdx/tdcall.S205
-rw-r--r--arch/x86/coco/tdx/tdx.c692
-rw-r--r--arch/x86/entry/calling.h19
-rw-r--r--arch/x86/entry/entry_64.S65
-rw-r--r--arch/x86/entry/entry_64_compat.S111
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/entry/vdso/vma.c2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c2
-rw-r--r--arch/x86/events/Kconfig8
-rw-r--r--arch/x86/events/amd/Makefile1
-rw-r--r--arch/x86/events/amd/brs.c367
-rw-r--r--arch/x86/events/amd/core.c505
-rw-r--r--arch/x86/events/amd/ibs.c209
-rw-r--r--arch/x86/events/core.c12
-rw-r--r--arch/x86/events/intel/core.c11
-rw-r--r--arch/x86/events/intel/cstate.c25
-rw-r--r--arch/x86/events/intel/lbr.c36
-rw-r--r--arch/x86/events/intel/uncore.c3
-rw-r--r--arch/x86/events/intel/uncore_snb.c442
-rw-r--r--arch/x86/events/msr.c3
-rw-r--r--arch/x86/events/perf_event.h125
-rw-r--r--arch/x86/ia32/Makefile2
-rw-r--r--arch/x86/ia32/ia32_aout.c325
-rw-r--r--arch/x86/include/asm/acenv.h14
-rw-r--r--arch/x86/include/asm/amd-ibs.h2
-rw-r--r--arch/x86/include/asm/amd_nb.h1
-rw-r--r--arch/x86/include/asm/apic.h7
-rw-r--r--arch/x86/include/asm/apicdef.h6
-rw-r--r--arch/x86/include/asm/asm.h20
-rw-r--r--arch/x86/include/asm/bootparam_utils.h1
-rw-r--r--arch/x86/include/asm/bug.h6
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h21
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h6
-rw-r--r--arch/x86/include/asm/compat.h6
-rw-r--r--arch/x86/include/asm/cpu.h22
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h6
-rw-r--r--arch/x86/include/asm/cpuid.h34
-rw-r--r--arch/x86/include/asm/disabled-features.h16
-rw-r--r--arch/x86/include/asm/efi.h5
-rw-r--r--arch/x86/include/asm/elf.h15
-rw-r--r--arch/x86/include/asm/entry-common.h4
-rw-r--r--arch/x86/include/asm/fpu/api.h3
-rw-r--r--arch/x86/include/asm/fpu/internal.h0
-rw-r--r--arch/x86/include/asm/highmem.h1
-rw-r--r--arch/x86/include/asm/idtentry.h4
-rw-r--r--arch/x86/include/asm/intel-family.h3
-rw-r--r--arch/x86/include/asm/io.h44
-rw-r--r--arch/x86/include/asm/irqflags.h8
-rw-r--r--arch/x86/include/asm/jump_label.h6
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h11
-rw-r--r--arch/x86/include/asm/kvm_para.h22
-rw-r--r--arch/x86/include/asm/mem_encrypt.h6
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/mmx.h0
-rw-r--r--arch/x86/include/asm/msi.h19
-rw-r--r--arch/x86/include/asm/msr-index.h22
-rw-r--r--arch/x86/include/asm/msr.h11
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/page_64.h2
-rw-r--r--arch/x86/include/asm/pci_x86.h9
-rw-r--r--arch/x86/include/asm/percpu.h6
-rw-r--r--arch/x86/include/asm/perf_event.h48
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/pkeys.h8
-rw-r--r--arch/x86/include/asm/proto.h8
-rw-r--r--arch/x86/include/asm/ptrace.h4
-rw-r--r--arch/x86/include/asm/realmode.h1
-rw-r--r--arch/x86/include/asm/segment.h12
-rw-r--r--arch/x86/include/asm/setup.h31
-rw-r--r--arch/x86/include/asm/sev-common.h82
-rw-r--r--arch/x86/include/asm/sev.h137
-rw-r--r--arch/x86/include/asm/shared/io.h34
-rw-r--r--arch/x86/include/asm/shared/msr.h15
-rw-r--r--arch/x86/include/asm/shared/tdx.h40
-rw-r--r--arch/x86/include/asm/smap.h24
-rw-r--r--arch/x86/include/asm/special_insns.h7
-rw-r--r--arch/x86/include/asm/static_call.h3
-rw-r--r--arch/x86/include/asm/suspend_32.h2
-rw-r--r--arch/x86/include/asm/suspend_64.h12
-rw-r--r--arch/x86/include/asm/svm.h179
-rw-r--r--arch/x86/include/asm/tdx.h91
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/timex.h9
-rw-r--r--arch/x86/include/asm/topology.h23
-rw-r--r--arch/x86/include/asm/traps.h2
-rw-r--r--arch/x86/include/asm/tsc.h7
-rw-r--r--arch/x86/include/uapi/asm/amd_hsmp.h114
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h4
-rw-r--r--arch/x86/include/uapi/asm/svm.h13
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c93
-rw-r--r--arch/x86/kernel/acpi/cppc.c29
-rw-r--r--arch/x86/kernel/alternative.c6
-rw-r--r--arch/x86/kernel/amd_nb.c7
-rw-r--r--arch/x86/kernel/apic/apic.c28
-rw-r--r--arch/x86/kernel/apic/io_apic.c18
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c16
-rw-r--r--arch/x86/kernel/asm-offsets.c17
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c480
-rw-r--r--arch/x86/kernel/cpu/bugs.c7
-rw-r--r--arch/x86/kernel/cpu/common.c105
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c117
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c32
-rw-r--r--arch/x86/kernel/cpu/mce/core.c4
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c110
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c6
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c59
-rw-r--r--arch/x86/kernel/cpu/proc.c11
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c14
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c113
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h2
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c13
-rw-r--r--arch/x86/kernel/cpu/tsx.c104
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/crash_dump_64.c1
-rw-r--r--arch/x86/kernel/fpu/core.c67
-rw-r--r--arch/x86/kernel/fpu/xstate.c65
-rw-r--r--arch/x86/kernel/fpu/xstate.h14
-rw-r--r--arch/x86/kernel/ftrace.c17
-rw-r--r--arch/x86/kernel/head64.c36
-rw-r--r--arch/x86/kernel/head_64.S65
-rw-r--r--arch/x86/kernel/idt.c3
-rw-r--r--arch/x86/kernel/kvm.c13
-rw-r--r--arch/x86/kernel/nmi.c12
-rw-r--r--arch/x86/kernel/probe_roms.c13
-rw-r--r--arch/x86/kernel/process.c23
-rw-r--r--arch/x86/kernel/process_32.c13
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/ptrace.c6
-rw-r--r--arch/x86/kernel/setup.c28
-rw-r--r--arch/x86/kernel/sev-shared.c534
-rw-r--r--arch/x86/kernel/sev.c855
-rw-r--r--arch/x86/kernel/signal.c8
-rw-r--r--arch/x86/kernel/signal_compat.c2
-rw-r--r--arch/x86/kernel/smpboot.c373
-rw-r--r--arch/x86/kernel/static_call.c5
-rw-r--r--arch/x86/kernel/sys_x86_64.c7
-rw-r--r--arch/x86/kernel/traps.c162
-rw-r--r--arch/x86/kernel/unwind_orc.c8
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/kvm/cpuid.c43
-rw-r--r--arch/x86/kvm/hyperv.c44
-rw-r--r--arch/x86/kvm/hyperv.h2
-rw-r--r--arch/x86/kvm/mmu.h24
-rw-r--r--arch/x86/kvm/mmu/mmu.c127
-rw-r--r--arch/x86/kvm/mmu/spte.c28
-rw-r--r--arch/x86/kvm/mmu/spte.h10
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h34
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c99
-rw-r--r--arch/x86/kvm/pmu.c7
-rw-r--r--arch/x86/kvm/pmu.h9
-rw-r--r--arch/x86/kvm/svm/avic.c3
-rw-r--r--arch/x86/kvm/svm/pmu.c29
-rw-r--r--arch/x86/kvm/svm/sev.c134
-rw-r--r--arch/x86/kvm/svm/svm.c9
-rw-r--r--arch/x86/kvm/svm/svm.h6
-rw-r--r--arch/x86/kvm/vmx/nested.c5
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c8
-rw-r--r--arch/x86/kvm/vmx/vmx.c7
-rw-r--r--arch/x86/kvm/vmx/vmx.h1
-rw-r--r--arch/x86/kvm/x86.c95
-rw-r--r--arch/x86/lib/copy_user_64.S87
-rw-r--r--arch/x86/lib/delay.c4
-rw-r--r--arch/x86/lib/insn-eval.c5
-rw-r--r--arch/x86/lib/kaslr.c2
-rw-r--r--arch/x86/lib/mmx_32.c0
-rw-r--r--arch/x86/lib/putuser.S4
-rw-r--r--arch/x86/lib/retpoline.S2
-rw-r--r--arch/x86/lib/usercopy_64.c2
-rw-r--r--arch/x86/math-emu/get_address.c2
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/amdtopology.c2
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/init_64.c6
-rw-r--r--arch/x86/mm/ioremap.c5
-rw-r--r--arch/x86/mm/mem_encrypt.c13
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c71
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c8
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/mm/numa_emulation.c4
-rw-r--r--arch/x86/mm/pat/memtype.c2
-rw-r--r--arch/x86/mm/pat/set_memory.c11
-rw-r--r--arch/x86/mm/pti.c2
-rw-r--r--arch/x86/mm/setup_nx.c62
-rw-r--r--arch/x86/mm/tlb.c37
-rw-r--r--arch/x86/net/bpf_jit_comp.c1
-rw-r--r--arch/x86/pci/irq.c377
-rw-r--r--arch/x86/pci/xen.c6
-rw-r--r--arch/x86/platform/efi/efi.c3
-rw-r--r--arch/x86/platform/pvh/head.S1
-rw-r--r--arch/x86/platform/uv/uv_nmi.c23
-rw-r--r--arch/x86/power/cpu.c31
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/realmode/rm/header.S1
-rw-r--r--arch/x86/realmode/rm/trampoline_64.S57
-rw-r--r--arch/x86/realmode/rm/trampoline_common.S12
-rw-r--r--arch/x86/realmode/rm/wakemain.c4
-rw-r--r--arch/x86/virt/vmx/tdx/tdxcall.S96
-rw-r--r--arch/x86/xen/enlighten_pv.c2
-rw-r--r--arch/x86/xen/smp_pv.c5
-rw-r--r--arch/x86/xen/xen-head.S1
239 files changed, 8368 insertions, 2963 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b0142e01002e..762a0b6ab8b6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -188,7 +188,7 @@ config X86
select HAVE_CONTEXT_TRACKING if X86_64
select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
- select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION
+ select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL
select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
@@ -212,6 +212,7 @@ config X86
select HAVE_IOREMAP_PROT
select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
select HAVE_IRQ_TIME_ACCOUNTING
+ select HAVE_JUMP_LABEL_HACK if HAVE_OBJTOOL
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZ4
@@ -230,7 +231,10 @@ config X86
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_MOVE_PMD
select HAVE_MOVE_PUD
+ select HAVE_NOINSTR_HACK if HAVE_OBJTOOL
select HAVE_NMI
+ select HAVE_NOINSTR_VALIDATION if HAVE_OBJTOOL
+ select HAVE_OBJTOOL if X86_64
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
@@ -239,17 +243,17 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
+ select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION
select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_SETUP_PER_CPU_AREA
select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
- select HAVE_STACK_VALIDATION if X86_64
+ select HAVE_STACK_VALIDATION if HAVE_OBJTOOL
select HAVE_STATIC_CALL
- select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
+ select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL
select HAVE_PREEMPT_DYNAMIC_CALL
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
@@ -268,7 +272,6 @@ config X86
select RTC_MC146818_LIB
select SPARSE_IRQ
select SRCU
- select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
select TRACE_IRQFLAGS_SUPPORT
@@ -459,6 +462,7 @@ config GOLDFISH
config RETPOLINE
bool "Avoid speculative indirect branches in kernel"
+ select OBJTOOL if HAVE_OBJTOOL
default y
help
Compile kernel with the retpoline compiler options to guard against
@@ -472,6 +476,7 @@ config CC_HAS_SLS
config SLS
bool "Mitigate Straight-Line-Speculation"
depends on CC_HAS_SLS && X86_64
+ select OBJTOOL if HAVE_OBJTOOL
default n
help
Compile the kernel with straight-line-speculation options to guard
@@ -878,6 +883,21 @@ config ACRN_GUEST
IOT with small footprint and real-time features. More details can be
found in https://projectacrn.org/.
+config INTEL_TDX_GUEST
+ bool "Intel TDX (Trust Domain Extensions) - Guest Support"
+ depends on X86_64 && CPU_SUP_INTEL
+ depends on X86_X2APIC
+ select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
+ select X86_MCE
+ help
+ Support running as a guest under Intel TDX. Without this support,
+ the guest kernel can not boot or run under TDX.
+ TDX includes memory encryption and integrity capabilities
+ which protect the confidentiality and integrity of guest
+ memory contents and CPU state. TDX guests are protected from
+ some attacks from the VMM.
+
endif #HYPERVISOR_GUEST
source "arch/x86/Kconfig.cpu"
@@ -1313,7 +1333,7 @@ config MICROCODE
config MICROCODE_INTEL
bool "Intel microcode loading support"
- depends on MICROCODE
+ depends on CPU_SUP_INTEL && MICROCODE
default MICROCODE
help
This options enables microcode patch loading support for Intel
@@ -1325,7 +1345,7 @@ config MICROCODE_INTEL
config MICROCODE_AMD
bool "AMD microcode loading support"
- depends on MICROCODE
+ depends on CPU_SUP_AMD && MICROCODE
help
If you select this option, microcode patch loading support for AMD
processors will be enabled.
@@ -1816,17 +1836,6 @@ config ARCH_RANDOM
If supported, this is a high bandwidth, cryptographically
secure hardware random number generator.
-config X86_SMAP
- def_bool y
- prompt "Supervisor Mode Access Prevention" if EXPERT
- help
- Supervisor Mode Access Prevention (SMAP) is a security
- feature in newer Intel processors. There is a small
- performance cost if this enabled and turned on; there is
- also a small increase in the kernel size if this is enabled.
-
- If unsure, say Y.
-
config X86_UMIP
def_bool y
prompt "User Mode Instruction Prevention" if EXPERT
@@ -1855,9 +1864,10 @@ config CC_HAS_IBT
config X86_KERNEL_IBT
prompt "Indirect Branch Tracking"
bool
- depends on X86_64 && CC_HAS_IBT && STACK_VALIDATION
+ depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL
# https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f
depends on !LD_IS_LLD || LLD_VERSION >= 140000
+ select OBJTOOL
help
Build the kernel with support for Indirect Branch Tracking, a
hardware support course-grain forward-edge Control Flow Integrity
@@ -1866,7 +1876,7 @@ config X86_KERNEL_IBT
code with them to make this happen.
In addition to building the kernel with IBT, seal all functions that
- are not indirect call targets, avoiding them ever becomming one.
+ are not indirect call targets, avoiding them ever becoming one.
This requires LTO like objtool runs and will slow down the build. It
does significantly reduce the number of ENDBR instructions in the
@@ -2326,7 +2336,9 @@ choice
it can be used to assist security vulnerability exploitation.
This setting can be changed at boot time via the kernel command
- line parameter vsyscall=[emulate|xonly|none].
+ line parameter vsyscall=[emulate|xonly|none]. Emulate mode
+ is deprecated and can only be enabled using the kernel command
+ line.
On a system with recent enough glibc (2.14 or newer) and no
static binaries, you can say None without a performance penalty
@@ -2334,20 +2346,6 @@ choice
If unsure, select "Emulate execution only".
- config LEGACY_VSYSCALL_EMULATE
- bool "Full emulation"
- help
- The kernel traps and emulates calls into the fixed vsyscall
- address mapping. This makes the mapping non-executable, but
- it still contains readable known contents, which could be
- used in certain rare security vulnerability exploits. This
- configuration is recommended when using legacy userspace
- that still uses vsyscalls along with legacy binary
- instrumentation tools that require code to be readable.
-
- An example of this type of legacy userspace is running
- Pin on an old binary that still uses vsyscalls.
-
config LEGACY_VSYSCALL_XONLY
bool "Emulate execution only"
help
@@ -2838,13 +2836,6 @@ config IA32_EMULATION
64-bit kernel. You should likely turn this on, unless you're
100% sure that you don't have any 32-bit programs left.
-config IA32_AOUT
- tristate "IA32 a.out support"
- depends on IA32_EMULATION
- depends on BROKEN
- help
- Support old a.out binaries in the 32bit emulation.
-
config X86_X32_ABI
bool "x32 ABI for 64-bit mode"
depends on X86_64
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d3a6f74a94bd..d872a7522e55 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -237,7 +237,7 @@ choice
config UNWINDER_ORC
bool "ORC unwinder"
depends on X86_64
- select STACK_VALIDATION
+ select OBJTOOL
help
This option enables the ORC (Oops Rewind Capability) unwinder for
unwinding kernel stack traces. It uses a custom data format which is
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 63d50f65b828..1abd7cc9d6cd 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -313,5 +313,6 @@ define archhelp
echo ''
echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
+ echo ' x86_debug.config - Enable tip tree debugging options for testing'
endef
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 34c9dbb6a47d..148ba5c5106e 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -26,6 +26,7 @@
#include "bitops.h"
#include "ctype.h"
#include "cpuflags.h"
+#include "io.h"
/* Useful macros */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
@@ -35,44 +36,10 @@ extern struct boot_params boot_params;
#define cpu_relax() asm volatile("rep; nop")
-/* Basic port I/O */
-static inline void outb(u8 v, u16 port)
-{
- asm volatile("outb %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u8 inb(u16 port)
-{
- u8 v;
- asm volatile("inb %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
-
-static inline void outw(u16 v, u16 port)
-{
- asm volatile("outw %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u16 inw(u16 port)
-{
- u16 v;
- asm volatile("inw %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
-
-static inline void outl(u32 v, u16 port)
-{
- asm volatile("outl %0,%1" : : "a" (v), "dN" (port));
-}
-static inline u32 inl(u16 port)
-{
- u32 v;
- asm volatile("inl %1,%0" : "=a" (v) : "dN" (port));
- return v;
-}
-
static inline void io_delay(void)
{
const u16 DELAY_PORT = 0x80;
- asm volatile("outb %%al,%0" : : "dN" (DELAY_PORT));
+ outb(0, DELAY_PORT);
}
/* These functions are used to reference data in other segments. */
@@ -110,66 +77,78 @@ typedef unsigned int addr_t;
static inline u8 rdfs8(addr_t addr)
{
+ u8 *ptr = (u8 *)absolute_pointer(addr);
u8 v;
- asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+ asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*ptr));
return v;
}
static inline u16 rdfs16(addr_t addr)
{
+ u16 *ptr = (u16 *)absolute_pointer(addr);
u16 v;
- asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ asm volatile("movw %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline u32 rdfs32(addr_t addr)
{
+ u32 *ptr = (u32 *)absolute_pointer(addr);
u32 v;
- asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ asm volatile("movl %%fs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline void wrfs8(u8 v, addr_t addr)
{
- asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
+ u8 *ptr = (u8 *)absolute_pointer(addr);
+ asm volatile("movb %1,%%fs:%0" : "+m" (*ptr) : "qi" (v));
}
static inline void wrfs16(u16 v, addr_t addr)
{
- asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
+ u16 *ptr = (u16 *)absolute_pointer(addr);
+ asm volatile("movw %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline void wrfs32(u32 v, addr_t addr)
{
- asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
+ u32 *ptr = (u32 *)absolute_pointer(addr);
+ asm volatile("movl %1,%%fs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline u8 rdgs8(addr_t addr)
{
+ u8 *ptr = (u8 *)absolute_pointer(addr);
u8 v;
- asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr));
+ asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*ptr));
return v;
}
static inline u16 rdgs16(addr_t addr)
{
+ u16 *ptr = (u16 *)absolute_pointer(addr);
u16 v;
- asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*(u16 *)addr));
+ asm volatile("movw %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline u32 rdgs32(addr_t addr)
{
+ u32 *ptr = (u32 *)absolute_pointer(addr);
u32 v;
- asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*(u32 *)addr));
+ asm volatile("movl %%gs:%1,%0" : "=r" (v) : "m" (*ptr));
return v;
}
static inline void wrgs8(u8 v, addr_t addr)
{
- asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v));
+ u8 *ptr = (u8 *)absolute_pointer(addr);
+ asm volatile("movb %1,%%gs:%0" : "+m" (*ptr) : "qi" (v));
}
static inline void wrgs16(u16 v, addr_t addr)
{
- asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v));
+ u16 *ptr = (u16 *)absolute_pointer(addr);
+ asm volatile("movw %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
}
static inline void wrgs32(u32 v, addr_t addr)
{
- asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v));
+ u32 *ptr = (u32 *)absolute_pointer(addr);
+ asm volatile("movl %1,%%gs:%0" : "+m" (*ptr) : "ri" (v));
}
/* Note: these only return true/false, not a signed return value! */
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 6115274fe10f..19e1905dcbf6 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -101,8 +101,10 @@ ifdef CONFIG_X86_64
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
+vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o
vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
+vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a
$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 8bcbcee54aa1..9caf89063e77 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -3,10 +3,9 @@
#include "misc.h"
#include "error.h"
#include "../string.h"
+#include "efi.h"
#include <linux/numa.h>
-#include <linux/efi.h>
-#include <asm/efi.h>
/*
* Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0'
@@ -20,153 +19,56 @@
*/
struct mem_vector immovable_mem[MAX_NUMNODES*2];
-/*
- * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and
- * ACPI_TABLE_GUID are found, take the former, which has more features.
- */
static acpi_physical_address
-__efi_get_rsdp_addr(unsigned long config_tables, unsigned int nr_tables,
- bool efi_64)
+__efi_get_rsdp_addr(unsigned long cfg_tbl_pa, unsigned int cfg_tbl_len)
{
- acpi_physical_address rsdp_addr = 0;
-
#ifdef CONFIG_EFI
- int i;
-
- /* Get EFI tables from systab. */
- for (i = 0; i < nr_tables; i++) {
- acpi_physical_address table;
- efi_guid_t guid;
-
- if (efi_64) {
- efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables + i;
-
- guid = tbl->guid;
- table = tbl->table;
-
- if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) {
- debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n");
- return 0;
- }
- } else {
- efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables + i;
-
- guid = tbl->guid;
- table = tbl->table;
- }
+ unsigned long rsdp_addr;
+ int ret;
- if (!(efi_guidcmp(guid, ACPI_TABLE_GUID)))
- rsdp_addr = table;
- else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID)))
- return table;
- }
+ /*
+ * Search EFI system tables for RSDP. Preferred is ACPI_20_TABLE_GUID to
+ * ACPI_TABLE_GUID because it has more features.
+ */
+ rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len,
+ ACPI_20_TABLE_GUID);
+ if (rsdp_addr)
+ return (acpi_physical_address)rsdp_addr;
+
+ /* No ACPI_20_TABLE_GUID found, fallback to ACPI_TABLE_GUID. */
+ rsdp_addr = efi_find_vendor_table(boot_params, cfg_tbl_pa, cfg_tbl_len,
+ ACPI_TABLE_GUID);
+ if (rsdp_addr)
+ return (acpi_physical_address)rsdp_addr;
+
+ debug_putstr("Error getting RSDP address.\n");
#endif
- return rsdp_addr;
-}
-
-/* EFI/kexec support is 64-bit only. */
-#ifdef CONFIG_X86_64
-static struct efi_setup_data *get_kexec_setup_data_addr(void)
-{
- struct setup_data *data;
- u64 pa_data;
-
- pa_data = boot_params->hdr.setup_data;
- while (pa_data) {
- data = (struct setup_data *)pa_data;
- if (data->type == SETUP_EFI)
- return (struct efi_setup_data *)(pa_data + sizeof(struct setup_data));
-
- pa_data = data->next;
- }
- return NULL;
-}
-
-static acpi_physical_address kexec_get_rsdp_addr(void)
-{
- efi_system_table_64_t *systab;
- struct efi_setup_data *esd;
- struct efi_info *ei;
- char *sig;
-
- esd = (struct efi_setup_data *)get_kexec_setup_data_addr();
- if (!esd)
- return 0;
-
- if (!esd->tables) {
- debug_putstr("Wrong kexec SETUP_EFI data.\n");
- return 0;
- }
-
- ei = &boot_params->efi_info;
- sig = (char *)&ei->efi_loader_signature;
- if (strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
- debug_putstr("Wrong kexec EFI loader signature.\n");
- return 0;
- }
-
- /* Get systab from boot params. */
- systab = (efi_system_table_64_t *) (ei->efi_systab | ((__u64)ei->efi_systab_hi << 32));
- if (!systab)
- error("EFI system table not found in kexec boot_params.");
-
- return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables, true);
+ return 0;
}
-#else
-static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; }
-#endif /* CONFIG_X86_64 */
static acpi_physical_address efi_get_rsdp_addr(void)
{
#ifdef CONFIG_EFI
- unsigned long systab, config_tables;
+ unsigned long cfg_tbl_pa = 0;
+ unsigned int cfg_tbl_len;
+ unsigned long systab_pa;
unsigned int nr_tables;
- struct efi_info *ei;
- bool efi_64;
- char *sig;
-
- ei = &boot_params->efi_info;
- sig = (char *)&ei->efi_loader_signature;
-
- if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
- efi_64 = true;
- } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
- efi_64 = false;
- } else {
- debug_putstr("Wrong EFI loader signature.\n");
- return 0;
- }
+ enum efi_type et;
+ int ret;
- /* Get systab from boot params. */
-#ifdef CONFIG_X86_64
- systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
-#else
- if (ei->efi_systab_hi || ei->efi_memmap_hi) {
- debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n");
+ et = efi_get_type(boot_params);
+ if (et == EFI_TYPE_NONE)
return 0;
- }
- systab = ei->efi_systab;
-#endif
- if (!systab)
- error("EFI system table not found.");
- /* Handle EFI bitness properly */
- if (efi_64) {
- efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab;
+ systab_pa = efi_get_system_table(boot_params);
+ if (!systab_pa)
+ error("EFI support advertised, but unable to locate system table.");
- config_tables = stbl->tables;
- nr_tables = stbl->nr_tables;
- } else {
- efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab;
+ ret = efi_get_conf_table(boot_params, &cfg_tbl_pa, &cfg_tbl_len);
+ if (ret || !cfg_tbl_pa)
+ error("EFI config table not found.");
- config_tables = stbl->tables;
- nr_tables = stbl->nr_tables;
- }
-
- if (!config_tables)
- error("EFI config tables not found.");
-
- return __efi_get_rsdp_addr(config_tables, nr_tables, efi_64);
+ return __efi_get_rsdp_addr(cfg_tbl_pa, cfg_tbl_len);
#else
return 0;
#endif
@@ -256,14 +158,6 @@ acpi_physical_address get_rsdp_addr(void)
pa = boot_params->acpi_rsdp_addr;
- /*
- * Try to get EFI data from setup_data. This can happen when we're a
- * kexec'ed kernel and kexec(1) has passed all the required EFI info to
- * us.
- */
- if (!pa)
- pa = kexec_get_rsdp_addr();
-
if (!pa)
pa = efi_get_rsdp_addr();
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
index 261e81fb9582..70a8d1706d0f 100644
--- a/arch/x86/boot/compressed/early_serial_console.c
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -1,5 +1,6 @@
#include "misc.h"
-int early_serial_base;
+/* This might be accessed before .bss is cleared, so use .data instead. */
+int early_serial_base __section(".data");
#include "../early_serial_console.c"
diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c
new file mode 100644
index 000000000000..6edd034b0b30
--- /dev/null
+++ b/arch/x86/boot/compressed/efi.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helpers for early access to EFI configuration table.
+ *
+ * Originally derived from arch/x86/boot/compressed/acpi.c
+ */
+
+#include "misc.h"
+
+/**
+ * efi_get_type - Given a pointer to boot_params, determine the type of EFI environment.
+ *
+ * @bp: pointer to boot_params
+ *
+ * Return: EFI_TYPE_{32,64} for valid EFI environments, EFI_TYPE_NONE otherwise.
+ */
+enum efi_type efi_get_type(struct boot_params *bp)
+{
+ struct efi_info *ei;
+ enum efi_type et;
+ const char *sig;
+
+ ei = &bp->efi_info;
+ sig = (char *)&ei->efi_loader_signature;
+
+ if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
+ et = EFI_TYPE_64;
+ } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
+ et = EFI_TYPE_32;
+ } else {
+ debug_putstr("No EFI environment detected.\n");
+ et = EFI_TYPE_NONE;
+ }
+
+#ifndef CONFIG_X86_64
+ /*
+ * Existing callers like acpi.c treat this case as an indicator to
+ * fall-through to non-EFI, rather than an error, so maintain that
+ * functionality here as well.
+ */
+ if (ei->efi_systab_hi || ei->efi_memmap_hi) {
+ debug_putstr("EFI system table is located above 4GB and cannot be accessed.\n");
+ et = EFI_TYPE_NONE;
+ }
+#endif
+
+ return et;
+}
+
+/**
+ * efi_get_system_table - Given a pointer to boot_params, retrieve the physical address
+ * of the EFI system table.
+ *
+ * @bp: pointer to boot_params
+ *
+ * Return: EFI system table address on success. On error, return 0.
+ */
+unsigned long efi_get_system_table(struct boot_params *bp)
+{
+ unsigned long sys_tbl_pa;
+ struct efi_info *ei;
+ enum efi_type et;
+
+ /* Get systab from boot params. */
+ ei = &bp->efi_info;
+#ifdef CONFIG_X86_64
+ sys_tbl_pa = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
+#else
+ sys_tbl_pa = ei->efi_systab;
+#endif
+ if (!sys_tbl_pa) {
+ debug_putstr("EFI system table not found.");
+ return 0;
+ }
+
+ return sys_tbl_pa;
+}
+
+/*
+ * EFI config table address changes to virtual address after boot, which may
+ * not be accessible for the kexec'd kernel. To address this, kexec provides
+ * the initial physical address via a struct setup_data entry, which is
+ * checked for here, along with some sanity checks.
+ */
+static struct efi_setup_data *get_kexec_setup_data(struct boot_params *bp,
+ enum efi_type et)
+{
+#ifdef CONFIG_X86_64
+ struct efi_setup_data *esd = NULL;
+ struct setup_data *data;
+ u64 pa_data;
+
+ pa_data = bp->hdr.setup_data;
+ while (pa_data) {
+ data = (struct setup_data *)pa_data;
+ if (data->type == SETUP_EFI) {
+ esd = (struct efi_setup_data *)(pa_data + sizeof(struct setup_data));
+ break;
+ }
+
+ pa_data = data->next;
+ }
+
+ /*
+ * Original ACPI code falls back to attempting normal EFI boot in these
+ * cases, so maintain existing behavior by indicating non-kexec
+ * environment to the caller, but print them for debugging.
+ */
+ if (esd && !esd->tables) {
+ debug_putstr("kexec EFI environment missing valid configuration table.\n");
+ return NULL;
+ }
+
+ return esd;
+#endif
+ return NULL;
+}
+
+/**
+ * efi_get_conf_table - Given a pointer to boot_params, locate and return the physical
+ * address of EFI configuration table.
+ *
+ * @bp: pointer to boot_params
+ * @cfg_tbl_pa: location to store physical address of config table
+ * @cfg_tbl_len: location to store number of config table entries
+ *
+ * Return: 0 on success. On error, return params are left unchanged.
+ */
+int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len)
+{
+ unsigned long sys_tbl_pa;
+ enum efi_type et;
+ int ret;
+
+ if (!cfg_tbl_pa || !cfg_tbl_len)
+ return -EINVAL;
+
+ sys_tbl_pa = efi_get_system_table(bp);
+ if (!sys_tbl_pa)
+ return -EINVAL;
+
+ /* Handle EFI bitness properly */
+ et = efi_get_type(bp);
+ if (et == EFI_TYPE_64) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)sys_tbl_pa;
+ struct efi_setup_data *esd;
+
+ /* kexec provides an alternative EFI conf table, check for it. */
+ esd = get_kexec_setup_data(bp, et);
+
+ *cfg_tbl_pa = esd ? esd->tables : stbl->tables;
+ *cfg_tbl_len = stbl->nr_tables;
+ } else if (et == EFI_TYPE_32) {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)sys_tbl_pa;
+
+ *cfg_tbl_pa = stbl->tables;
+ *cfg_tbl_len = stbl->nr_tables;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Get vendor table address/guid from EFI config table at the given index */
+static int get_vendor_table(void *cfg_tbl, unsigned int idx,
+ unsigned long *vendor_tbl_pa,
+ efi_guid_t *vendor_tbl_guid,
+ enum efi_type et)
+{
+ if (et == EFI_TYPE_64) {
+ efi_config_table_64_t *tbl_entry = (efi_config_table_64_t *)cfg_tbl + idx;
+
+ if (!IS_ENABLED(CONFIG_X86_64) && tbl_entry->table >> 32) {
+ debug_putstr("Error: EFI config table entry located above 4GB.\n");
+ return -EINVAL;
+ }
+
+ *vendor_tbl_pa = tbl_entry->table;
+ *vendor_tbl_guid = tbl_entry->guid;
+
+ } else if (et == EFI_TYPE_32) {
+ efi_config_table_32_t *tbl_entry = (efi_config_table_32_t *)cfg_tbl + idx;
+
+ *vendor_tbl_pa = tbl_entry->table;
+ *vendor_tbl_guid = tbl_entry->guid;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * efi_find_vendor_table - Given EFI config table, search it for the physical
+ * address of the vendor table associated with GUID.
+ *
+ * @bp: pointer to boot_params
+ * @cfg_tbl_pa: pointer to EFI configuration table
+ * @cfg_tbl_len: number of entries in EFI configuration table
+ * @guid: GUID of vendor table
+ *
+ * Return: vendor table address on success. On error, return 0.
+ */
+unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid)
+{
+ enum efi_type et;
+ unsigned int i;
+
+ et = efi_get_type(bp);
+ if (et == EFI_TYPE_NONE)
+ return 0;
+
+ for (i = 0; i < cfg_tbl_len; i++) {
+ unsigned long vendor_tbl_pa;
+ efi_guid_t vendor_tbl_guid;
+ int ret;
+
+ ret = get_vendor_table((void *)cfg_tbl_pa, i,
+ &vendor_tbl_pa,
+ &vendor_tbl_guid, et);
+ if (ret)
+ return 0;
+
+ if (!efi_guidcmp(guid, vendor_tbl_guid))
+ return vendor_tbl_pa;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h
new file mode 100644
index 000000000000..7db2f41b54cd
--- /dev/null
+++ b/arch/x86/boot/compressed/efi.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_EFI_H
+#define BOOT_COMPRESSED_EFI_H
+
+#if defined(_LINUX_EFI_H) || defined(_ASM_X86_EFI_H)
+#error Please do not include kernel proper namespace headers
+#endif
+
+typedef guid_t efi_guid_t __aligned(__alignof__(u32));
+
+#define EFI_GUID(a, b, c, d...) (efi_guid_t){ { \
+ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+ (b) & 0xff, ((b) >> 8) & 0xff, \
+ (c) & 0xff, ((c) >> 8) & 0xff, d } }
+
+#define ACPI_TABLE_GUID EFI_GUID(0xeb9d2d30, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
+#define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81)
+#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42)
+
+#define EFI32_LOADER_SIGNATURE "EL32"
+#define EFI64_LOADER_SIGNATURE "EL64"
+
+/*
+ * Generic EFI table header
+ */
+typedef struct {
+ u64 signature;
+ u32 revision;
+ u32 headersize;
+ u32 crc32;
+ u32 reserved;
+} efi_table_hdr_t;
+
+#define EFI_CONVENTIONAL_MEMORY 7
+
+#define EFI_MEMORY_MORE_RELIABLE \
+ ((u64)0x0000000000010000ULL) /* higher reliability */
+#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */
+
+#define EFI_PAGE_SHIFT 12
+
+typedef struct {
+ u32 type;
+ u32 pad;
+ u64 phys_addr;
+ u64 virt_addr;
+ u64 num_pages;
+ u64 attribute;
+} efi_memory_desc_t;
+
+#define efi_early_memdesc_ptr(map, desc_size, n) \
+ (efi_memory_desc_t *)((void *)(map) + ((n) * (desc_size)))
+
+typedef struct {
+ efi_guid_t guid;
+ u64 table;
+} efi_config_table_64_t;
+
+typedef struct {
+ efi_guid_t guid;
+ u32 table;
+} efi_config_table_32_t;
+
+typedef struct {
+ efi_table_hdr_t hdr;
+ u64 fw_vendor; /* physical addr of CHAR16 vendor string */
+ u32 fw_revision;
+ u32 __pad1;
+ u64 con_in_handle;
+ u64 con_in;
+ u64 con_out_handle;
+ u64 con_out;
+ u64 stderr_handle;
+ u64 stderr;
+ u64 runtime;
+ u64 boottime;
+ u32 nr_tables;
+ u32 __pad2;
+ u64 tables;
+} efi_system_table_64_t;
+
+typedef struct {
+ efi_table_hdr_t hdr;
+ u32 fw_vendor; /* physical addr of CHAR16 vendor string */
+ u32 fw_revision;
+ u32 con_in_handle;
+ u32 con_in;
+ u32 con_out_handle;
+ u32 con_out;
+ u32 stderr_handle;
+ u32 stderr;
+ u32 runtime;
+ u32 boottime;
+ u32 nr_tables;
+ u32 tables;
+} efi_system_table_32_t;
+
+/* kexec external ABI */
+struct efi_setup_data {
+ u64 fw_vendor;
+ u64 __unused;
+ u64 tables;
+ u64 smbios;
+ u64 reserved[8];
+};
+
+static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right)
+{
+ return memcmp(&left, &right, sizeof (efi_guid_t));
+}
+
+#ifdef CONFIG_EFI
+bool __pure __efi_soft_reserve_enabled(void);
+
+static inline bool __pure efi_soft_reserve_enabled(void)
+{
+ return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE)
+ && __efi_soft_reserve_enabled();
+}
+#else
+static inline bool efi_soft_reserve_enabled(void)
+{
+ return false;
+}
+#endif /* CONFIG_EFI */
+#endif /* BOOT_COMPRESSED_EFI_H */
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index dea95301196b..d33f060900d2 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -189,11 +189,11 @@ SYM_FUNC_START(startup_32)
subl $32, %eax /* Encryption bit is always above bit 31 */
bts %eax, %edx /* Set encryption mask for page tables */
/*
- * Mark SEV as active in sev_status so that startup32_check_sev_cbit()
- * will do a check. The sev_status memory will be fully initialized
- * with the contents of MSR_AMD_SEV_STATUS later in
- * set_sev_encryption_mask(). For now it is sufficient to know that SEV
- * is active.
+ * Set MSR_AMD64_SEV_ENABLED_BIT in sev_status so that
+ * startup32_check_sev_cbit() will do a check. sev_enable() will
+ * initialize sev_status with all the bits reported by
+ * MSR_AMD_SEV_STATUS later, but only MSR_AMD64_SEV_ENABLED_BIT
+ * needs to be set for now.
*/
movl $1, rva(sev_status)(%ebp)
1:
@@ -289,7 +289,7 @@ SYM_FUNC_START(startup_32)
pushl %eax
/* Enter paged protected Mode, activating Long Mode */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax /* Enable Paging and Protected mode */
+ movl $CR0_STATE, %eax
movl %eax, %cr0
/* Jump from 32bit compatibility mode into 64bit mode. */
@@ -447,6 +447,23 @@ SYM_CODE_START(startup_64)
call load_stage1_idt
popq %rsi
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Now that the stage1 interrupt handlers are set up, #VC exceptions from
+ * CPUID instructions can be properly handled for SEV-ES guests.
+ *
+ * For SEV-SNP, the CPUID table also needs to be set up in advance of any
+ * CPUID instructions being issued, so go ahead and do that now via
+ * sev_enable(), which will also handle the rest of the SEV-related
+ * detection/setup to ensure that has been done in advance of any dependent
+ * code.
+ */
+ pushq %rsi
+ movq %rsi, %rdi /* real mode address */
+ call sev_enable
+ popq %rsi
+#endif
+
/*
* paging_prepare() sets up the trampoline and checks if we need to
* enable 5-level paging.
@@ -558,17 +575,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
shrq $3, %rcx
rep stosq
-/*
- * If running as an SEV guest, the encryption mask is required in the
- * page-table setup code below. When the guest also has SEV-ES enabled
- * set_sev_encryption_mask() will cause #VC exceptions, but the stage2
- * handler can't map its GHCB because the page-table is not set up yet.
- * So set up the encryption mask here while still on the stage1 #VC
- * handler. Then load stage2 IDT and switch to the kernel's own
- * page-table.
- */
pushq %rsi
- call set_sev_encryption_mask
call load_stage2_idt
/* Pass boot_params to initialize_identity_maps() */
@@ -642,12 +649,28 @@ SYM_CODE_START(trampoline_32bit_src)
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+ jc 1f
wrmsr
- popl %edx
+1: popl %edx
popl %ecx
+#ifdef CONFIG_X86_MCE
+ /*
+ * Preserve CR4.MCE if the kernel will enable #MC support.
+ * Clearing MCE may fault in some environments (that also force #MC
+ * support). Any machine check that occurs before #MC support is fully
+ * configured will crash the system regardless of the CR4.MCE value set
+ * here.
+ */
+ movl %cr4, %eax
+ andl $X86_CR4_MCE, %eax
+#else
+ movl $0, %eax
+#endif
+
/* Enable PAE and LA57 (if required) paging modes */
- movl $X86_CR4_PAE, %eax
+ orl $X86_CR4_PAE, %eax
testl %edx, %edx
jz 1f
orl $X86_CR4_LA57, %eax
@@ -661,8 +684,9 @@ SYM_CODE_START(trampoline_32bit_src)
pushl $__KERNEL_CS
pushl %eax
- /* Enable paging again */
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
+ /* Enable paging again. */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
lret
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index f7213d0943b8..44c350d627c7 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -90,7 +90,7 @@ static struct x86_mapping_info mapping_info;
/*
* Adds the specified range to the identity mappings.
*/
-static void add_identity_map(unsigned long start, unsigned long end)
+void kernel_add_identity_map(unsigned long start, unsigned long end)
{
int ret;
@@ -157,14 +157,15 @@ void initialize_identity_maps(void *rmode)
* explicitly here in case the compressed kernel does not touch them,
* or does not touch all the pages covering them.
*/
- add_identity_map((unsigned long)_head, (unsigned long)_end);
+ kernel_add_identity_map((unsigned long)_head, (unsigned long)_end);
boot_params = rmode;
- add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1));
+ kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1));
cmdline = get_cmd_line_ptr();
- add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
+ kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE);
+
+ sev_prep_identity_maps(top_level_pgt);
/* Load the new page-table. */
- sev_verify_cbit(top_level_pgt);
write_cr3(top_level_pgt);
}
@@ -246,10 +247,10 @@ static int set_clr_page_flags(struct x86_mapping_info *info,
* It should already exist, but keep things generic.
*
* To map the page just read from it and fault it in if there is no
- * mapping yet. add_identity_map() can't be called here because that
- * would unconditionally map the address on PMD level, destroying any
- * PTE-level mappings that might already exist. Use assembly here so
- * the access won't be optimized away.
+ * mapping yet. kernel_add_identity_map() can't be called here because
+ * that would unconditionally map the address on PMD level, destroying
+ * any PTE-level mappings that might already exist. Use assembly here
+ * so the access won't be optimized away.
*/
asm volatile("mov %[address], %%r9"
:: [address] "g" (*(unsigned long *)address)
@@ -275,15 +276,31 @@ static int set_clr_page_flags(struct x86_mapping_info *info,
* Changing encryption attributes of a page requires to flush it from
* the caches.
*/
- if ((set | clr) & _PAGE_ENC)
+ if ((set | clr) & _PAGE_ENC) {
clflush_page(address);
+ /*
+ * If the encryption attribute is being cleared, change the page state
+ * to shared in the RMP table.
+ */
+ if (clr)
+ snp_set_page_shared(__pa(address & PAGE_MASK));
+ }
+
/* Update PTE */
pte = *ptep;
pte = pte_set_flags(pte, set);
pte = pte_clear_flags(pte, clr);
set_pte(ptep, pte);
+ /*
+ * If the encryption attribute is being set, then change the page state to
+ * private in the RMP entry. The page state change must be done after the PTE
+ * is updated.
+ */
+ if (set & _PAGE_ENC)
+ snp_set_page_private(__pa(address & PAGE_MASK));
+
/* Flush TLB after changing encryption attribute */
write_cr3(top_level_pgt);
@@ -347,5 +364,5 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
* Error code is sane - now identity map the 2M region around
* the faulting address.
*/
- add_identity_map(address, end);
+ kernel_add_identity_map(address, end);
}
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 9b93567d663a..6debb816e83d 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -39,7 +39,23 @@ void load_stage1_idt(void)
load_boot_idt(&boot_idt_desc);
}
-/* Setup IDT after kernel jumping to .Lrelocated */
+/*
+ * Setup IDT after kernel jumping to .Lrelocated.
+ *
+ * initialize_identity_maps() needs a #PF handler to be setup
+ * in order to be able to fault-in identity mapping ranges; see
+ * do_boot_page_fault().
+ *
+ * This #PF handler setup needs to happen in load_stage2_idt() where the
+ * IDT is loaded and there the #VC IDT entry gets setup too.
+ *
+ * In order to be able to handle #VCs, one needs a GHCB which
+ * gets setup with an already set up pagetable, which is done in
+ * initialize_identity_maps(). And there's the catch 22: the boot #VC
+ * handler do_boot_stage2_vc() needs to call early_setup_ghcb() itself
+ * (and, especially set_page_decrypted()) because the SEV-ES setup code
+ * cannot initialize a GHCB as there's no #PF handler yet...
+ */
void load_stage2_idt(void)
{
boot_idt_desc.address = (unsigned long)boot_idt;
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 411b268bc0a2..4a3f223973f4 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -22,15 +22,14 @@
#include "misc.h"
#include "error.h"
#include "../string.h"
+#include "efi.h"
#include <generated/compile.h>
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/ctype.h>
-#include <linux/efi.h>
#include <generated/utsrelease.h>
-#include <asm/efi.h>
#define _SETUP
#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index a63424d13627..a73e4d783cae 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -187,42 +187,6 @@ SYM_CODE_END(startup32_vc_handler)
.code64
#include "../../kernel/sev_verify_cbit.S"
-SYM_FUNC_START(set_sev_encryption_mask)
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- push %rbp
- push %rdx
-
- movq %rsp, %rbp /* Save current stack pointer */
-
- call get_sev_encryption_bit /* Get the encryption bit position */
- testl %eax, %eax
- jz .Lno_sev_mask
-
- bts %rax, sme_me_mask(%rip) /* Create the encryption mask */
-
- /*
- * Read MSR_AMD64_SEV again and store it to sev_status. Can't do this in
- * get_sev_encryption_bit() because this function is 32-bit code and
- * shared between 64-bit and 32-bit boot path.
- */
- movl $MSR_AMD64_SEV, %ecx /* Read the SEV MSR */
- rdmsr
-
- /* Store MSR value in sev_status */
- shlq $32, %rdx
- orq %rdx, %rax
- movq %rax, sev_status(%rip)
-
-.Lno_sev_mask:
- movq %rbp, %rsp /* Restore original stack pointer */
-
- pop %rdx
- pop %rbp
-#endif
-
- xor %rax, %rax
- RET
-SYM_FUNC_END(set_sev_encryption_mask)
.data
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 1cdcaf34ee36..cf690d8712f4 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -48,12 +48,17 @@ void *memmove(void *dest, const void *src, size_t n);
*/
struct boot_params *boot_params;
+struct port_io_ops pio_ops;
+
memptr free_mem_ptr;
memptr free_mem_end_ptr;
static char *vidmem;
static int vidport;
-static int lines, cols;
+
+/* These might be accessed before .bss is cleared, so use .data instead. */
+static int lines __section(".data");
+static int cols __section(".data");
#ifdef CONFIG_KERNEL_GZIP
#include "../../../../lib/decompress_inflate.c"
@@ -371,6 +376,16 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
lines = boot_params->screen_info.orig_video_lines;
cols = boot_params->screen_info.orig_video_cols;
+ init_default_io_ops();
+
+ /*
+ * Detect TDX guest environment.
+ *
+ * It has to be done before console_init() in order to use
+ * paravirtualized port I/O operations if needed.
+ */
+ early_tdx_detect();
+
console_init();
/*
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 16ed360b6692..4910bf230d7b 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -22,17 +22,21 @@
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
-#include <linux/io.h>
#include <asm/page.h>
#include <asm/boot.h>
#include <asm/bootparam.h>
#include <asm/desc_defs.h>
+#include "tdx.h"
+
#define BOOT_CTYPE_H
#include <linux/acpi.h>
#define BOOT_BOOT_H
#include "../ctype.h"
+#include "../io.h"
+
+#include "efi.h"
#ifdef CONFIG_X86_64
#define memptr long
@@ -120,17 +124,23 @@ static inline void console_init(void)
{ }
#endif
-void set_sev_encryption_mask(void);
-
#ifdef CONFIG_AMD_MEM_ENCRYPT
+void sev_enable(struct boot_params *bp);
void sev_es_shutdown_ghcb(void);
extern bool sev_es_check_ghcb_fault(unsigned long address);
+void snp_set_page_private(unsigned long paddr);
+void snp_set_page_shared(unsigned long paddr);
+void sev_prep_identity_maps(unsigned long top_level_pgt);
#else
+static inline void sev_enable(struct boot_params *bp) { }
static inline void sev_es_shutdown_ghcb(void) { }
static inline bool sev_es_check_ghcb_fault(unsigned long address)
{
return false;
}
+static inline void snp_set_page_private(unsigned long paddr) { }
+static inline void snp_set_page_shared(unsigned long paddr) { }
+static inline void sev_prep_identity_maps(unsigned long top_level_pgt) { }
#endif
/* acpi.c */
@@ -151,6 +161,7 @@ static inline int count_immovable_mem_regions(void) { return 0; }
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
#endif
+extern void kernel_add_identity_map(unsigned long start, unsigned long end);
/* Used by PAGE_KERN* macros: */
extern pteval_t __default_kernel_pte_mask;
@@ -172,4 +183,47 @@ void boot_stage2_vc(void);
unsigned long sev_verify_cbit(unsigned long cr3);
+enum efi_type {
+ EFI_TYPE_64,
+ EFI_TYPE_32,
+ EFI_TYPE_NONE,
+};
+
+#ifdef CONFIG_EFI
+/* helpers for early EFI config table access */
+enum efi_type efi_get_type(struct boot_params *bp);
+unsigned long efi_get_system_table(struct boot_params *bp);
+int efi_get_conf_table(struct boot_params *bp, unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len);
+unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid);
+#else
+static inline enum efi_type efi_get_type(struct boot_params *bp)
+{
+ return EFI_TYPE_NONE;
+}
+
+static inline unsigned long efi_get_system_table(struct boot_params *bp)
+{
+ return 0;
+}
+
+static inline int efi_get_conf_table(struct boot_params *bp,
+ unsigned long *cfg_tbl_pa,
+ unsigned int *cfg_tbl_len)
+{
+ return -ENOENT;
+}
+
+static inline unsigned long efi_find_vendor_table(struct boot_params *bp,
+ unsigned long cfg_tbl_pa,
+ unsigned int cfg_tbl_len,
+ efi_guid_t guid)
+{
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index 6ff7e81b5628..cc9b2529a086 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -6,7 +6,7 @@
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x80
#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index a1733319a22a..2ac12ff4111b 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,11 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
-#include <linux/efi.h>
#include <asm/e820/types.h>
#include <asm/processor.h>
-#include <asm/efi.h>
#include "pgtable.h"
#include "../string.h"
+#include "efi.h"
#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */
#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 28bcf04c022e..52f989f6acc2 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -20,8 +20,10 @@
#include <asm/fpu/xcr.h>
#include <asm/ptrace.h>
#include <asm/svm.h>
+#include <asm/cpuid.h>
#include "error.h"
+#include "../msr.h"
struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
struct ghcb *boot_ghcb;
@@ -56,23 +58,19 @@ static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
static inline u64 sev_es_rd_ghcb_msr(void)
{
- unsigned long low, high;
+ struct msr m;
- asm volatile("rdmsr" : "=a" (low), "=d" (high) :
- "c" (MSR_AMD64_SEV_ES_GHCB));
+ boot_rdmsr(MSR_AMD64_SEV_ES_GHCB, &m);
- return ((high << 32) | low);
+ return m.q;
}
static inline void sev_es_wr_ghcb_msr(u64 val)
{
- u32 low, high;
+ struct msr m;
- low = val & 0xffffffffUL;
- high = val >> 32;
-
- asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB),
- "a"(low), "d" (high) : "memory");
+ m.q = val;
+ boot_wrmsr(MSR_AMD64_SEV_ES_GHCB, &m);
}
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
@@ -119,11 +117,54 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
/* Include code for early handlers */
#include "../../kernel/sev-shared.c"
-static bool early_setup_sev_es(void)
+static inline bool sev_snp_enabled(void)
+{
+ return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
+}
+
+static void __page_state_change(unsigned long paddr, enum psc_op op)
+{
+ u64 val;
+
+ if (!sev_snp_enabled())
+ return;
+
+ /*
+ * If private -> shared then invalidate the page before requesting the
+ * state change in the RMP table.
+ */
+ if (op == SNP_PAGE_STATE_SHARED && pvalidate(paddr, RMP_PG_SIZE_4K, 0))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+
+ /* Issue VMGEXIT to change the page state in RMP table. */
+ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
+ VMGEXIT();
+
+ /* Read the response of the VMGEXIT. */
+ val = sev_es_rd_ghcb_msr();
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+
+ /*
+ * Now that page state is changed in the RMP table, validate it so that it is
+ * consistent with the RMP entry.
+ */
+ if (op == SNP_PAGE_STATE_PRIVATE && pvalidate(paddr, RMP_PG_SIZE_4K, 1))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+}
+
+void snp_set_page_private(unsigned long paddr)
+{
+ __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE);
+}
+
+void snp_set_page_shared(unsigned long paddr)
{
- if (!sev_es_negotiate_protocol())
- sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED);
+ __page_state_change(paddr, SNP_PAGE_STATE_SHARED);
+}
+static bool early_setup_ghcb(void)
+{
if (set_page_decrypted((unsigned long)&boot_ghcb_page))
return false;
@@ -135,6 +176,10 @@ static bool early_setup_sev_es(void)
/* Initialize lookup tables for the instruction decoder */
inat_init_tables();
+ /* SNP guest requires the GHCB GPA must be registered */
+ if (sev_snp_enabled())
+ snp_register_ghcb_early(__pa(&boot_ghcb_page));
+
return true;
}
@@ -174,8 +219,8 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
struct es_em_ctxt ctxt;
enum es_result result;
- if (!boot_ghcb && !early_setup_sev_es())
- sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
+ if (!boot_ghcb && !early_setup_ghcb())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
result = vc_init_em_ctxt(&ctxt, regs, exit_code);
@@ -202,5 +247,191 @@ finish:
if (result == ES_OK)
vc_finish_insn(&ctxt);
else if (result != ES_RETRY)
- sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
+static void enforce_vmpl0(void)
+{
+ u64 attrs;
+ int err;
+
+ /*
+ * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically
+ * higher) privilege level. Here, clear the VMPL1 permission mask of the
+ * GHCB page. If the guest is not running at VMPL0, this will fail.
+ *
+ * If the guest is running at VMPL0, it will succeed. Even if that operation
+ * modifies permission bits, it is still ok to do so currently because Linux
+ * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks
+ * changing is a don't-care.
+ */
+ attrs = 1;
+ if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
+}
+
+void sev_enable(struct boot_params *bp)
+{
+ unsigned int eax, ebx, ecx, edx;
+ struct msr m;
+ bool snp;
+
+ /*
+ * Setup/preliminary detection of SNP. This will be sanity-checked
+ * against CPUID/MSR values later.
+ */
+ snp = snp_init(bp);
+
+ /* Check for the SME/SEV support leaf */
+ eax = 0x80000000;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (eax < 0x8000001f)
+ return;
+
+ /*
+ * Check for the SME/SEV feature:
+ * CPUID Fn8000_001F[EAX]
+ * - Bit 0 - Secure Memory Encryption support
+ * - Bit 1 - Secure Encrypted Virtualization support
+ * CPUID Fn8000_001F[EBX]
+ * - Bits 5:0 - Pagetable bit position used to indicate encryption
+ */
+ eax = 0x8000001f;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ /* Check whether SEV is supported */
+ if (!(eax & BIT(1))) {
+ if (snp)
+ error("SEV-SNP support indicated by CC blob, but not CPUID.");
+ return;
+ }
+
+ /* Set the SME mask if this is an SEV guest. */
+ boot_rdmsr(MSR_AMD64_SEV, &m);
+ sev_status = m.q;
+ if (!(sev_status & MSR_AMD64_SEV_ENABLED))
+ return;
+
+ /* Negotiate the GHCB protocol version. */
+ if (sev_status & MSR_AMD64_SEV_ES_ENABLED) {
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED);
+ }
+
+ /*
+ * SNP is supported in v2 of the GHCB spec which mandates support for HV
+ * features.
+ */
+ if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) {
+ if (!(get_hv_features() & GHCB_HV_FT_SNP))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ enforce_vmpl0();
+ }
+
+ if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ error("SEV-SNP supported indicated by CC blob, but not SEV status MSR.");
+
+ sme_me_mask = BIT_ULL(ebx & 0x3f);
+}
+
+/* Search for Confidential Computing blob in the EFI config table. */
+static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp)
+{
+ unsigned long cfg_table_pa;
+ unsigned int cfg_table_len;
+ int ret;
+
+ ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len);
+ if (ret)
+ return NULL;
+
+ return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa,
+ cfg_table_len,
+ EFI_CC_BLOB_GUID);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the boot kernel
+ * by firmware/bootloader in the following ways:
+ *
+ * - via an entry in the EFI config table
+ * - via a setup_data structure, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ cc_info = find_cc_blob_efi(bp);
+ if (cc_info)
+ goto found_cc_info;
+
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ return cc_info;
+}
+
+/*
+ * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks
+ * will verify the SNP CPUID/MSR bits.
+ */
+bool snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ /*
+ * If a SNP-specific Confidential Computing blob is present, then
+ * firmware/bootloader have indicated SNP support. Verifying this
+ * involves CPUID checks which will be more reliable if the SNP
+ * CPUID table is used. See comments over snp_setup_cpuid_table() for
+ * more details.
+ */
+ setup_cpuid_table(cc_info);
+
+ /*
+ * Pass run-time kernel a pointer to CC info via boot_params so EFI
+ * config table doesn't need to be searched again during early startup
+ * phase.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
+void sev_prep_identity_maps(unsigned long top_level_pgt)
+{
+ /*
+ * The Confidential Computing blob is used very early in uncompressed
+ * kernel to find the in-memory CPUID table to handle CPUID
+ * instructions. Make sure an identity-mapping exists so it can be
+ * accessed after switchover.
+ */
+ if (sev_snp_enabled()) {
+ unsigned long cc_info_pa = boot_params->cc_blob_address;
+ struct cc_blob_sev_info *cc_info;
+
+ kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info));
+
+ cc_info = (struct cc_blob_sev_info *)cc_info_pa;
+ kernel_add_identity_map(cc_info->cpuid_phys, cc_info->cpuid_phys + cc_info->cpuid_len);
+ }
+
+ sev_verify_cbit(top_level_pgt);
}
diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S
new file mode 100644
index 000000000000..46d0495e0d3a
--- /dev/null
+++ b/arch/x86/boot/compressed/tdcall.S
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "../../coco/tdx/tdcall.S"
diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c
new file mode 100644
index 000000000000..918a7606f53c
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../cpuflags.h"
+#include "../string.h"
+#include "../io.h"
+#include "error.h"
+
+#include <vdso/limits.h>
+#include <uapi/asm/vmx.h>
+
+#include <asm/shared/tdx.h>
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __tdx_hypercall_failed(void)
+{
+ error("TDVMCALL failed. TDX module bug?");
+}
+
+static inline unsigned int tdx_io_in(int size, u16 port)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_IO_INSTRUCTION,
+ .r12 = size,
+ .r13 = 0,
+ .r14 = port,
+ };
+
+ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ return UINT_MAX;
+
+ return args.r11;
+}
+
+static inline void tdx_io_out(int size, u16 port, u32 value)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_IO_INSTRUCTION,
+ .r12 = size,
+ .r13 = 1,
+ .r14 = port,
+ .r15 = value,
+ };
+
+ __tdx_hypercall(&args, 0);
+}
+
+static inline u8 tdx_inb(u16 port)
+{
+ return tdx_io_in(1, port);
+}
+
+static inline void tdx_outb(u8 value, u16 port)
+{
+ tdx_io_out(1, port, value);
+}
+
+static inline void tdx_outw(u16 value, u16 port)
+{
+ tdx_io_out(2, port, value);
+}
+
+void early_tdx_detect(void)
+{
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
+
+ if (memcmp(TDX_IDENT, sig, sizeof(sig)))
+ return;
+
+ /* Use hypercalls instead of I/O instructions */
+ pio_ops.f_inb = tdx_inb;
+ pio_ops.f_outb = tdx_outb;
+ pio_ops.f_outw = tdx_outw;
+}
diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h
new file mode 100644
index 000000000000..9055482cd35c
--- /dev/null
+++ b/arch/x86/boot/compressed/tdx.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_COMPRESSED_TDX_H
+#define BOOT_COMPRESSED_TDX_H
+
+#include <linux/types.h>
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+void early_tdx_detect(void);
+#else
+static inline void early_tdx_detect(void) { };
+#endif
+
+#endif /* BOOT_COMPRESSED_TDX_H */
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index e1478d32de1a..fed8d13ce252 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -27,6 +27,7 @@
#include <asm/required-features.h>
#include <asm/msr-index.h>
#include "string.h"
+#include "msr.h"
static u32 err_flags[NCAPINTS];
@@ -130,12 +131,11 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
/* If this is an AMD and we're only missing SSE+SSE2, try to
turn them on */
- u32 ecx = MSR_K7_HWCR;
- u32 eax, edx;
+ struct msr m;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- eax &= ~(1 << 15);
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ boot_rdmsr(MSR_K7_HWCR, &m);
+ m.l &= ~(1 << 15);
+ boot_wrmsr(MSR_K7_HWCR, &m);
get_cpuflags(); /* Make sure it really did something */
err = check_cpuflags();
@@ -145,28 +145,28 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
/* If this is a VIA C3, we might have to enable CX8
explicitly */
- u32 ecx = MSR_VIA_FCR;
- u32 eax, edx;
+ struct msr m;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- eax |= (1<<1)|(1<<7);
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ boot_rdmsr(MSR_VIA_FCR, &m);
+ m.l |= (1 << 1) | (1 << 7);
+ boot_wrmsr(MSR_VIA_FCR, &m);
set_bit(X86_FEATURE_CX8, cpu.flags);
err = check_cpuflags();
} else if (err == 0x01 && is_transmeta()) {
/* Transmeta might have masked feature bits in word 0 */
- u32 ecx = 0x80860004;
- u32 eax, edx;
+ struct msr m, m_tmp;
u32 level = 1;
- asm("rdmsr" : "=a" (eax), "=d" (edx) : "c" (ecx));
- asm("wrmsr" : : "a" (~0), "d" (edx), "c" (ecx));
+ boot_rdmsr(0x80860004, &m);
+ m_tmp = m;
+ m_tmp.l = ~0;
+ boot_wrmsr(0x80860004, &m_tmp);
asm("cpuid"
: "+a" (level), "=d" (cpu.flags[0])
: : "ecx", "ebx");
- asm("wrmsr" : : "a" (eax), "d" (edx), "c" (ecx));
+ boot_wrmsr(0x80860004, &m);
err = check_cpuflags();
} else if (err == 0x01 &&
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index a0b75f73dc63..a83d67ec627d 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -71,8 +71,7 @@ int has_eflag(unsigned long mask)
# define EBX_REG "=b"
#endif
-static inline void cpuid_count(u32 id, u32 count,
- u32 *a, u32 *b, u32 *c, u32 *d)
+void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d)
{
asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
"cpuid \n\t"
diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h
index 2e20814d3ce3..475b8fde90f7 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -17,5 +17,6 @@ extern u32 cpu_vendor[3];
int has_eflag(unsigned long mask);
void get_cpuflags(void);
+void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d);
#endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 6dbd7e9f74c9..0352e4589efa 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -163,7 +163,11 @@ extra_header_fields:
.long 0x200 # SizeOfHeaders
.long 0 # CheckSum
.word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application)
+#ifdef CONFIG_DXE_MEM_ATTRIBUTES
+ .word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics
+#else
.word 0 # DllCharacteristics
+#endif
#ifdef CONFIG_X86_32
.long 0 # SizeOfStackReserve
.long 0 # SizeOfStackCommit
diff --git a/arch/x86/boot/io.h b/arch/x86/boot/io.h
new file mode 100644
index 000000000000..110880907f87
--- /dev/null
+++ b/arch/x86/boot/io.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BOOT_IO_H
+#define BOOT_IO_H
+
+#include <asm/shared/io.h>
+
+#undef inb
+#undef inw
+#undef inl
+#undef outb
+#undef outw
+#undef outl
+
+struct port_io_ops {
+ u8 (*f_inb)(u16 port);
+ void (*f_outb)(u8 v, u16 port);
+ void (*f_outw)(u16 v, u16 port);
+};
+
+extern struct port_io_ops pio_ops;
+
+/*
+ * Use the normal I/O instructions by default.
+ * TDX guests override these to use hypercalls.
+ */
+static inline void init_default_io_ops(void)
+{
+ pio_ops.f_inb = __inb;
+ pio_ops.f_outb = __outb;
+ pio_ops.f_outw = __outw;
+}
+
+/*
+ * Redirect port I/O operations via pio_ops callbacks.
+ * TDX guests override these callbacks with TDX-specific helpers.
+ */
+#define inb pio_ops.f_inb
+#define outb pio_ops.f_outb
+#define outw pio_ops.f_outw
+
+#endif
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index e3add857c2c9..c4ea5258ab55 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -17,6 +17,8 @@
struct boot_params boot_params __attribute__((aligned(16)));
+struct port_io_ops pio_ops;
+
char *HEAP = _end;
char *heap_end = _end; /* Default end of heap = no heap */
@@ -33,7 +35,7 @@ static void copy_boot_params(void)
u16 cl_offset;
};
const struct old_cmdline * const oldcmd =
- (const struct old_cmdline *)OLD_CL_ADDRESS;
+ absolute_pointer(OLD_CL_ADDRESS);
BUILD_BUG_ON(sizeof(boot_params) != 4096);
memcpy(&boot_params.hdr, &hdr, sizeof(hdr));
@@ -133,6 +135,8 @@ static void init_heap(void)
void main(void)
{
+ init_default_io_ops();
+
/* First, copy the boot header into the "zeropage" */
copy_boot_params();
diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h
new file mode 100644
index 000000000000..aed66f7ae199
--- /dev/null
+++ b/arch/x86/boot/msr.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Helpers/definitions related to MSR access.
+ */
+
+#ifndef BOOT_MSR_H
+#define BOOT_MSR_H
+
+#include <asm/shared/msr.h>
+
+/*
+ * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the
+ * boot kernel since they rely on tracepoint/exception handling infrastructure
+ * that's not available here.
+ */
+static inline void boot_rdmsr(unsigned int reg, struct msr *m)
+{
+ asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg));
+}
+
+static inline void boot_wrmsr(unsigned int reg, const struct msr *m)
+{
+ asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory");
+}
+
+#endif /* BOOT_MSR_H */
diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile
index c1ead00017a7..c816acf78b6a 100644
--- a/arch/x86/coco/Makefile
+++ b/arch/x86/coco/Makefile
@@ -4,3 +4,5 @@ KASAN_SANITIZE_core.o := n
CFLAGS_core.o += -fno-stack-protector
obj-y += core.o
+
+obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index fc1365dd927e..49b44f881484 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -18,7 +18,15 @@ static u64 cc_mask __ro_after_init;
static bool intel_cc_platform_has(enum cc_attr attr)
{
- return false;
+ switch (attr) {
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ case CC_ATTR_HOTPLUG_DISABLED:
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ case CC_ATTR_MEM_ENCRYPT:
+ return true;
+ default:
+ return false;
+ }
}
/*
@@ -57,6 +65,9 @@ static bool amd_cc_platform_has(enum cc_attr attr)
return (sev_status & MSR_AMD64_SEV_ENABLED) &&
!(sev_status & MSR_AMD64_SEV_ES_ENABLED);
+ case CC_ATTR_GUEST_SEV_SNP:
+ return sev_status & MSR_AMD64_SEV_SNP_ENABLED;
+
default:
return false;
}
@@ -87,9 +98,18 @@ EXPORT_SYMBOL_GPL(cc_platform_has);
u64 cc_mkenc(u64 val)
{
+ /*
+ * Both AMD and Intel use a bit in the page table to indicate
+ * encryption status of the page.
+ *
+ * - for AMD, bit *set* means the page is encrypted
+ * - for Intel *clear* means encrypted.
+ */
switch (vendor) {
case CC_VENDOR_AMD:
return val | cc_mask;
+ case CC_VENDOR_INTEL:
+ return val & ~cc_mask;
default:
return val;
}
@@ -97,9 +117,12 @@ u64 cc_mkenc(u64 val)
u64 cc_mkdec(u64 val)
{
+ /* See comment in cc_mkenc() */
switch (vendor) {
case CC_VENDOR_AMD:
return val & ~cc_mask;
+ case CC_VENDOR_INTEL:
+ return val | cc_mask;
default:
return val;
}
diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile
new file mode 100644
index 000000000000..46c55998557d
--- /dev/null
+++ b/arch/x86/coco/tdx/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y += tdx.o tdcall.o
diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
new file mode 100644
index 000000000000..f9eb1134f22d
--- /dev/null
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/asm-offsets.h>
+#include <asm/asm.h>
+#include <asm/frame.h>
+#include <asm/unwind_hints.h>
+
+#include <linux/linkage.h>
+#include <linux/bits.h>
+#include <linux/errno.h>
+
+#include "../../virt/vmx/tdx/tdxcall.S"
+
+/*
+ * Bitmasks of exposed registers (with VMM).
+ */
+#define TDX_R10 BIT(10)
+#define TDX_R11 BIT(11)
+#define TDX_R12 BIT(12)
+#define TDX_R13 BIT(13)
+#define TDX_R14 BIT(14)
+#define TDX_R15 BIT(15)
+
+/*
+ * These registers are clobbered to hold arguments for each
+ * TDVMCALL. They are safe to expose to the VMM.
+ * Each bit in this mask represents a register ID. Bit field
+ * details can be found in TDX GHCI specification, section
+ * titled "TDCALL [TDG.VP.VMCALL] leaf".
+ */
+#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \
+ TDX_R12 | TDX_R13 | \
+ TDX_R14 | TDX_R15 )
+
+/*
+ * __tdx_module_call() - Used by TDX guests to request services from
+ * the TDX module (does not include VMM services) using TDCALL instruction.
+ *
+ * Transforms function call register arguments into the TDCALL register ABI.
+ * After TDCALL operation, TDX module output is saved in @out (if it is
+ * provided by the user).
+ *
+ *-------------------------------------------------------------------------
+ * TDCALL ABI:
+ *-------------------------------------------------------------------------
+ * Input Registers:
+ *
+ * RAX - TDCALL Leaf number.
+ * RCX,RDX,R8-R9 - TDCALL Leaf specific input registers.
+ *
+ * Output Registers:
+ *
+ * RAX - TDCALL instruction error code.
+ * RCX,RDX,R8-R11 - TDCALL Leaf specific output registers.
+ *
+ *-------------------------------------------------------------------------
+ *
+ * __tdx_module_call() function ABI:
+ *
+ * @fn (RDI) - TDCALL Leaf ID, moved to RAX
+ * @rcx (RSI) - Input parameter 1, moved to RCX
+ * @rdx (RDX) - Input parameter 2, moved to RDX
+ * @r8 (RCX) - Input parameter 3, moved to R8
+ * @r9 (R8) - Input parameter 4, moved to R9
+ *
+ * @out (R9) - struct tdx_module_output pointer
+ * stored temporarily in R12 (not
+ * shared with the TDX module). It
+ * can be NULL.
+ *
+ * Return status of TDCALL via RAX.
+ */
+SYM_FUNC_START(__tdx_module_call)
+ FRAME_BEGIN
+ TDX_MODULE_CALL host=0
+ FRAME_END
+ RET
+SYM_FUNC_END(__tdx_module_call)
+
+/*
+ * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf
+ * of TDCALL instruction
+ *
+ * Transforms values in function call argument struct tdx_hypercall_args @args
+ * into the TDCALL register ABI. After TDCALL operation, VMM output is saved
+ * back in @args.
+ *
+ *-------------------------------------------------------------------------
+ * TD VMCALL ABI:
+ *-------------------------------------------------------------------------
+ *
+ * Input Registers:
+ *
+ * RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL)
+ * RCX - BITMAP which controls which part of TD Guest GPR
+ * is passed as-is to the VMM and back.
+ * R10 - Set 0 to indicate TDCALL follows standard TDX ABI
+ * specification. Non zero value indicates vendor
+ * specific ABI.
+ * R11 - VMCALL sub function number
+ * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments.
+ * R8-R9, R12-R15 - Same as above.
+ *
+ * Output Registers:
+ *
+ * RAX - TDCALL instruction status (Not related to hypercall
+ * output).
+ * R10 - Hypercall output error code.
+ * R11-R15 - Hypercall sub function specific output values.
+ *
+ *-------------------------------------------------------------------------
+ *
+ * __tdx_hypercall() function ABI:
+ *
+ * @args (RDI) - struct tdx_hypercall_args for input and output
+ * @flags (RSI) - TDX_HCALL_* flags
+ *
+ * On successful completion, return the hypercall error code.
+ */
+SYM_FUNC_START(__tdx_hypercall)
+ FRAME_BEGIN
+
+ /* Save callee-saved GPRs as mandated by the x86_64 ABI */
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+
+ /* Mangle function call ABI into TDCALL ABI: */
+ /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */
+ xor %eax, %eax
+
+ /* Copy hypercall registers from arg struct: */
+ movq TDX_HYPERCALL_r10(%rdi), %r10
+ movq TDX_HYPERCALL_r11(%rdi), %r11
+ movq TDX_HYPERCALL_r12(%rdi), %r12
+ movq TDX_HYPERCALL_r13(%rdi), %r13
+ movq TDX_HYPERCALL_r14(%rdi), %r14
+ movq TDX_HYPERCALL_r15(%rdi), %r15
+
+ movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
+
+ /*
+ * For the idle loop STI needs to be called directly before the TDCALL
+ * that enters idle (EXIT_REASON_HLT case). STI instruction enables
+ * interrupts only one instruction later. If there is a window between
+ * STI and the instruction that emulates the HALT state, there is a
+ * chance for interrupts to happen in this window, which can delay the
+ * HLT operation indefinitely. Since this is the not the desired
+ * result, conditionally call STI before TDCALL.
+ */
+ testq $TDX_HCALL_ISSUE_STI, %rsi
+ jz .Lskip_sti
+ sti
+.Lskip_sti:
+ tdcall
+
+ /*
+ * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that
+ * something has gone horribly wrong with the TDX module.
+ *
+ * The return status of the hypercall operation is in a separate
+ * register (in R10). Hypercall errors are a part of normal operation
+ * and are handled by callers.
+ */
+ testq %rax, %rax
+ jne .Lpanic
+
+ /* TDVMCALL leaf return code is in R10 */
+ movq %r10, %rax
+
+ /* Copy hypercall result registers to arg struct if needed */
+ testq $TDX_HCALL_HAS_OUTPUT, %rsi
+ jz .Lout
+
+ movq %r10, TDX_HYPERCALL_r10(%rdi)
+ movq %r11, TDX_HYPERCALL_r11(%rdi)
+ movq %r12, TDX_HYPERCALL_r12(%rdi)
+ movq %r13, TDX_HYPERCALL_r13(%rdi)
+ movq %r14, TDX_HYPERCALL_r14(%rdi)
+ movq %r15, TDX_HYPERCALL_r15(%rdi)
+.Lout:
+ /*
+ * Zero out registers exposed to the VMM to avoid speculative execution
+ * with VMM-controlled values. This needs to include all registers
+ * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15
+ * context will be restored.
+ */
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+
+ /* Restore callee-saved GPRs as mandated by the x86_64 ABI */
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+
+ FRAME_END
+
+ RET
+.Lpanic:
+ call __tdx_hypercall_failed
+ /* __tdx_hypercall_failed never returns */
+ REACHABLE
+ jmp .Lpanic
+SYM_FUNC_END(__tdx_hypercall)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
new file mode 100644
index 000000000000..03deb4d6920d
--- /dev/null
+++ b/arch/x86/coco/tdx/tdx.c
@@ -0,0 +1,692 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2021-2022 Intel Corporation */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tdx: " fmt
+
+#include <linux/cpufeature.h>
+#include <asm/coco.h>
+#include <asm/tdx.h>
+#include <asm/vmx.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/pgtable.h>
+
+/* TDX module Call Leaf IDs */
+#define TDX_GET_INFO 1
+#define TDX_GET_VEINFO 3
+#define TDX_ACCEPT_PAGE 6
+
+/* TDX hypercall Leaf IDs */
+#define TDVMCALL_MAP_GPA 0x10001
+
+/* MMIO direction */
+#define EPT_READ 0
+#define EPT_WRITE 1
+
+/* Port I/O direction */
+#define PORT_READ 0
+#define PORT_WRITE 1
+
+/* See Exit Qualification for I/O Instructions in VMX documentation */
+#define VE_IS_IO_IN(e) ((e) & BIT(3))
+#define VE_GET_IO_SIZE(e) (((e) & GENMASK(2, 0)) + 1)
+#define VE_GET_PORT_NUM(e) ((e) >> 16)
+#define VE_IS_IO_STRING(e) ((e) & BIT(4))
+
+/*
+ * Wrapper for standard use of __tdx_hypercall with no output aside from
+ * return code.
+ */
+static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = fn,
+ .r12 = r12,
+ .r13 = r13,
+ .r14 = r14,
+ .r15 = r15,
+ };
+
+ return __tdx_hypercall(&args, 0);
+}
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __tdx_hypercall_failed(void)
+{
+ panic("TDVMCALL failed. TDX module bug?");
+}
+
+/*
+ * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined
+ * independently from but are currently matched 1:1 with VMX EXIT_REASONs.
+ * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
+ * guest sides of these calls.
+ */
+static u64 hcall_func(u64 exit_reason)
+{
+ return exit_reason;
+}
+
+#ifdef CONFIG_KVM_GUEST
+long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = nr,
+ .r11 = p1,
+ .r12 = p2,
+ .r13 = p3,
+ .r14 = p4,
+ };
+
+ return __tdx_hypercall(&args, 0);
+}
+EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
+#endif
+
+/*
+ * Used for TDX guests to make calls directly to the TD module. This
+ * should only be used for calls that have no legitimate reason to fail
+ * or where the kernel can not survive the call failing.
+ */
+static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
+ struct tdx_module_output *out)
+{
+ if (__tdx_module_call(fn, rcx, rdx, r8, r9, out))
+ panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
+}
+
+static u64 get_cc_mask(void)
+{
+ struct tdx_module_output out;
+ unsigned int gpa_width;
+
+ /*
+ * TDINFO TDX module call is used to get the TD execution environment
+ * information like GPA width, number of available vcpus, debug mode
+ * information, etc. More details about the ABI can be found in TDX
+ * Guest-Host-Communication Interface (GHCI), section 2.4.2 TDCALL
+ * [TDG.VP.INFO].
+ *
+ * The GPA width that comes out of this call is critical. TDX guests
+ * can not meaningfully run without it.
+ */
+ tdx_module_call(TDX_GET_INFO, 0, 0, 0, 0, &out);
+
+ gpa_width = out.rcx & GENMASK(5, 0);
+
+ /*
+ * The highest bit of a guest physical address is the "sharing" bit.
+ * Set it for shared pages and clear it for private pages.
+ */
+ return BIT_ULL(gpa_width - 1);
+}
+
+static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_HLT),
+ .r12 = irq_disabled,
+ };
+
+ /*
+ * Emulate HLT operation via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section 3.8 TDG.VP.VMCALL<Instruction.HLT>.
+ *
+ * The VMM uses the "IRQ disabled" param to understand IRQ
+ * enabled status (RFLAGS.IF) of the TD guest and to determine
+ * whether or not it should schedule the halted vCPU if an
+ * IRQ becomes pending. E.g. if IRQs are disabled, the VMM
+ * can keep the vCPU in virtual HLT, even if an IRQ is
+ * pending, without hanging/breaking the guest.
+ */
+ return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
+}
+
+static bool handle_halt(void)
+{
+ /*
+ * Since non safe halt is mainly used in CPU offlining
+ * and the guest will always stay in the halt state, don't
+ * call the STI instruction (set do_sti as false).
+ */
+ const bool irq_disabled = irqs_disabled();
+ const bool do_sti = false;
+
+ if (__halt(irq_disabled, do_sti))
+ return false;
+
+ return true;
+}
+
+void __cpuidle tdx_safe_halt(void)
+{
+ /*
+ * For do_sti=true case, __tdx_hypercall() function enables
+ * interrupts using the STI instruction before the TDCALL. So
+ * set irq_disabled as false.
+ */
+ const bool irq_disabled = false;
+ const bool do_sti = true;
+
+ /*
+ * Use WARN_ONCE() to report the failure.
+ */
+ if (__halt(irq_disabled, do_sti))
+ WARN_ONCE(1, "HLT instruction emulation failed\n");
+}
+
+static bool read_msr(struct pt_regs *regs)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_MSR_READ),
+ .r12 = regs->cx,
+ };
+
+ /*
+ * Emulate the MSR read via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
+ */
+ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ return false;
+
+ regs->ax = lower_32_bits(args.r11);
+ regs->dx = upper_32_bits(args.r11);
+ return true;
+}
+
+static bool write_msr(struct pt_regs *regs)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_MSR_WRITE),
+ .r12 = regs->cx,
+ .r13 = (u64)regs->dx << 32 | regs->ax,
+ };
+
+ /*
+ * Emulate the MSR write via hypercall. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface
+ * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
+ */
+ return !__tdx_hypercall(&args, 0);
+}
+
+static bool handle_cpuid(struct pt_regs *regs)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_CPUID),
+ .r12 = regs->ax,
+ .r13 = regs->cx,
+ };
+
+ /*
+ * Only allow VMM to control range reserved for hypervisor
+ * communication.
+ *
+ * Return all-zeros for any CPUID outside the range. It matches CPU
+ * behaviour for non-supported leaf.
+ */
+ if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
+ regs->ax = regs->bx = regs->cx = regs->dx = 0;
+ return true;
+ }
+
+ /*
+ * Emulate the CPUID instruction via a hypercall. More info about
+ * ABI can be found in TDX Guest-Host-Communication Interface
+ * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
+ */
+ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ return false;
+
+ /*
+ * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
+ * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
+ * So copy the register contents back to pt_regs.
+ */
+ regs->ax = args.r12;
+ regs->bx = args.r13;
+ regs->cx = args.r14;
+ regs->dx = args.r15;
+
+ return true;
+}
+
+static bool mmio_read(int size, unsigned long addr, unsigned long *val)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_EPT_VIOLATION),
+ .r12 = size,
+ .r13 = EPT_READ,
+ .r14 = addr,
+ .r15 = *val,
+ };
+
+ if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ return false;
+ *val = args.r11;
+ return true;
+}
+
+static bool mmio_write(int size, unsigned long addr, unsigned long val)
+{
+ return !_tdx_hypercall(hcall_func(EXIT_REASON_EPT_VIOLATION), size,
+ EPT_WRITE, addr, val);
+}
+
+static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
+{
+ char buffer[MAX_INSN_SIZE];
+ unsigned long *reg, val;
+ struct insn insn = {};
+ enum mmio_type mmio;
+ int size, extend_size;
+ u8 extend_val = 0;
+
+ /* Only in-kernel MMIO is supported */
+ if (WARN_ON_ONCE(user_mode(regs)))
+ return false;
+
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
+ return false;
+
+ if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
+ return false;
+
+ mmio = insn_decode_mmio(&insn, &size);
+ if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
+ return false;
+
+ if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ reg = insn_get_modrm_reg_ptr(&insn, regs);
+ if (!reg)
+ return false;
+ }
+
+ ve->instr_len = insn.length;
+
+ /* Handle writes first */
+ switch (mmio) {
+ case MMIO_WRITE:
+ memcpy(&val, reg, size);
+ return mmio_write(size, ve->gpa, val);
+ case MMIO_WRITE_IMM:
+ val = insn.immediate.value;
+ return mmio_write(size, ve->gpa, val);
+ case MMIO_READ:
+ case MMIO_READ_ZERO_EXTEND:
+ case MMIO_READ_SIGN_EXTEND:
+ /* Reads are handled below */
+ break;
+ case MMIO_MOVS:
+ case MMIO_DECODE_FAILED:
+ /*
+ * MMIO was accessed with an instruction that could not be
+ * decoded or handled properly. It was likely not using io.h
+ * helpers or accessed MMIO accidentally.
+ */
+ return false;
+ default:
+ WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
+ return false;
+ }
+
+ /* Handle reads */
+ if (!mmio_read(size, ve->gpa, &val))
+ return false;
+
+ switch (mmio) {
+ case MMIO_READ:
+ /* Zero-extend for 32-bit operation */
+ extend_size = size == 4 ? sizeof(*reg) : 0;
+ break;
+ case MMIO_READ_ZERO_EXTEND:
+ /* Zero extend based on operand size */
+ extend_size = insn.opnd_bytes;
+ break;
+ case MMIO_READ_SIGN_EXTEND:
+ /* Sign extend based on operand size */
+ extend_size = insn.opnd_bytes;
+ if (size == 1 && val & BIT(7))
+ extend_val = 0xFF;
+ else if (size > 1 && val & BIT(15))
+ extend_val = 0xFF;
+ break;
+ default:
+ /* All other cases has to be covered with the first switch() */
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ if (extend_size)
+ memset(reg, extend_val, extend_size);
+ memcpy(reg, &val, size);
+ return true;
+}
+
+static bool handle_in(struct pt_regs *regs, int size, int port)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = hcall_func(EXIT_REASON_IO_INSTRUCTION),
+ .r12 = size,
+ .r13 = PORT_READ,
+ .r14 = port,
+ };
+ u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
+ bool success;
+
+ /*
+ * Emulate the I/O read via hypercall. More info about ABI can be found
+ * in TDX Guest-Host-Communication Interface (GHCI) section titled
+ * "TDG.VP.VMCALL<Instruction.IO>".
+ */
+ success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT);
+
+ /* Update part of the register affected by the emulated instruction */
+ regs->ax &= ~mask;
+ if (success)
+ regs->ax |= args.r11 & mask;
+
+ return success;
+}
+
+static bool handle_out(struct pt_regs *regs, int size, int port)
+{
+ u64 mask = GENMASK(BITS_PER_BYTE * size, 0);
+
+ /*
+ * Emulate the I/O write via hypercall. More info about ABI can be found
+ * in TDX Guest-Host-Communication Interface (GHCI) section titled
+ * "TDG.VP.VMCALL<Instruction.IO>".
+ */
+ return !_tdx_hypercall(hcall_func(EXIT_REASON_IO_INSTRUCTION), size,
+ PORT_WRITE, port, regs->ax & mask);
+}
+
+/*
+ * Emulate I/O using hypercall.
+ *
+ * Assumes the IO instruction was using ax, which is enforced
+ * by the standard io.h macros.
+ *
+ * Return True on success or False on failure.
+ */
+static bool handle_io(struct pt_regs *regs, u32 exit_qual)
+{
+ int size, port;
+ bool in;
+
+ if (VE_IS_IO_STRING(exit_qual))
+ return false;
+
+ in = VE_IS_IO_IN(exit_qual);
+ size = VE_GET_IO_SIZE(exit_qual);
+ port = VE_GET_PORT_NUM(exit_qual);
+
+
+ if (in)
+ return handle_in(regs, size, port);
+ else
+ return handle_out(regs, size, port);
+}
+
+/*
+ * Early #VE exception handler. Only handles a subset of port I/O.
+ * Intended only for earlyprintk. If failed, return false.
+ */
+__init bool tdx_early_handle_ve(struct pt_regs *regs)
+{
+ struct ve_info ve;
+
+ tdx_get_ve_info(&ve);
+
+ if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
+ return false;
+
+ return handle_io(regs, ve.exit_qual);
+}
+
+void tdx_get_ve_info(struct ve_info *ve)
+{
+ struct tdx_module_output out;
+
+ /*
+ * Called during #VE handling to retrieve the #VE info from the
+ * TDX module.
+ *
+ * This has to be called early in #VE handling. A "nested" #VE which
+ * occurs before this will raise a #DF and is not recoverable.
+ *
+ * The call retrieves the #VE info from the TDX module, which also
+ * clears the "#VE valid" flag. This must be done before anything else
+ * because any #VE that occurs while the valid flag is set will lead to
+ * #DF.
+ *
+ * Note, the TDX module treats virtual NMIs as inhibited if the #VE
+ * valid flag is set. It means that NMI=>#VE will not result in a #DF.
+ */
+ tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out);
+
+ /* Transfer the output parameters */
+ ve->exit_reason = out.rcx;
+ ve->exit_qual = out.rdx;
+ ve->gla = out.r8;
+ ve->gpa = out.r9;
+ ve->instr_len = lower_32_bits(out.r10);
+ ve->instr_info = upper_32_bits(out.r10);
+}
+
+/* Handle the user initiated #VE */
+static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_CPUID:
+ return handle_cpuid(regs);
+ default:
+ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+ return false;
+ }
+}
+
+/* Handle the kernel #VE */
+static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ return handle_halt();
+ case EXIT_REASON_MSR_READ:
+ return read_msr(regs);
+ case EXIT_REASON_MSR_WRITE:
+ return write_msr(regs);
+ case EXIT_REASON_CPUID:
+ return handle_cpuid(regs);
+ case EXIT_REASON_EPT_VIOLATION:
+ return handle_mmio(regs, ve);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return handle_io(regs, ve->exit_qual);
+ default:
+ pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
+ return false;
+ }
+}
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
+{
+ bool ret;
+
+ if (user_mode(regs))
+ ret = virt_exception_user(regs, ve);
+ else
+ ret = virt_exception_kernel(regs, ve);
+
+ /* After successful #VE handling, move the IP */
+ if (ret)
+ regs->ip += ve->instr_len;
+
+ return ret;
+}
+
+static bool tdx_tlb_flush_required(bool private)
+{
+ /*
+ * TDX guest is responsible for flushing TLB on private->shared
+ * transition. VMM is responsible for flushing on shared->private.
+ *
+ * The VMM _can't_ flush private addresses as it can't generate PAs
+ * with the guest's HKID. Shared memory isn't subject to integrity
+ * checking, i.e. the VMM doesn't need to flush for its own protection.
+ *
+ * There's no need to flush when converting from shared to private,
+ * as flushing is the VMM's responsibility in this case, e.g. it must
+ * flush to avoid integrity failures in the face of a buggy or
+ * malicious guest.
+ */
+ return !private;
+}
+
+static bool tdx_cache_flush_required(void)
+{
+ /*
+ * AMD SME/SEV can avoid cache flushing if HW enforces cache coherence.
+ * TDX doesn't have such capability.
+ *
+ * Flush cache unconditionally.
+ */
+ return true;
+}
+
+static bool try_accept_one(phys_addr_t *start, unsigned long len,
+ enum pg_level pg_level)
+{
+ unsigned long accept_size = page_level_size(pg_level);
+ u64 tdcall_rcx;
+ u8 page_size;
+
+ if (!IS_ALIGNED(*start, accept_size))
+ return false;
+
+ if (len < accept_size)
+ return false;
+
+ /*
+ * Pass the page physical address to the TDX module to accept the
+ * pending, private page.
+ *
+ * Bits 2:0 of RCX encode page size: 0 - 4K, 1 - 2M, 2 - 1G.
+ */
+ switch (pg_level) {
+ case PG_LEVEL_4K:
+ page_size = 0;
+ break;
+ case PG_LEVEL_2M:
+ page_size = 1;
+ break;
+ case PG_LEVEL_1G:
+ page_size = 2;
+ break;
+ default:
+ return false;
+ }
+
+ tdcall_rcx = *start | page_size;
+ if (__tdx_module_call(TDX_ACCEPT_PAGE, tdcall_rcx, 0, 0, 0, NULL))
+ return false;
+
+ *start += accept_size;
+ return true;
+}
+
+/*
+ * Inform the VMM of the guest's intent for this physical page: shared with
+ * the VMM or private to the guest. The VMM is expected to change its mapping
+ * of the page in response.
+ */
+static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
+{
+ phys_addr_t start = __pa(vaddr);
+ phys_addr_t end = __pa(vaddr + numpages * PAGE_SIZE);
+
+ if (!enc) {
+ /* Set the shared (decrypted) bits: */
+ start |= cc_mkdec(0);
+ end |= cc_mkdec(0);
+ }
+
+ /*
+ * Notify the VMM about page mapping conversion. More info about ABI
+ * can be found in TDX Guest-Host-Communication Interface (GHCI),
+ * section "TDG.VP.VMCALL<MapGPA>"
+ */
+ if (_tdx_hypercall(TDVMCALL_MAP_GPA, start, end - start, 0, 0))
+ return false;
+
+ /* private->shared conversion requires only MapGPA call */
+ if (!enc)
+ return true;
+
+ /*
+ * For shared->private conversion, accept the page using
+ * TDX_ACCEPT_PAGE TDX module call.
+ */
+ while (start < end) {
+ unsigned long len = end - start;
+
+ /*
+ * Try larger accepts first. It gives chance to VMM to keep
+ * 1G/2M SEPT entries where possible and speeds up process by
+ * cutting number of hypercalls (if successful).
+ */
+
+ if (try_accept_one(&start, len, PG_LEVEL_1G))
+ continue;
+
+ if (try_accept_one(&start, len, PG_LEVEL_2M))
+ continue;
+
+ if (!try_accept_one(&start, len, PG_LEVEL_4K))
+ return false;
+ }
+
+ return true;
+}
+
+void __init tdx_early_init(void)
+{
+ u64 cc_mask;
+ u32 eax, sig[3];
+
+ cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]);
+
+ if (memcmp(TDX_IDENT, sig, sizeof(sig)))
+ return;
+
+ setup_force_cpu_cap(X86_FEATURE_TDX_GUEST);
+
+ cc_set_vendor(CC_VENDOR_INTEL);
+ cc_mask = get_cc_mask();
+ cc_set_mask(cc_mask);
+
+ /*
+ * All bits above GPA width are reserved and kernel treats shared bit
+ * as flag, not as part of physical address.
+ *
+ * Adjust physical mask to only cover valid GPA bits.
+ */
+ physical_mask &= cc_mask - 1;
+
+ x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
+ x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+ x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed;
+
+ pr_info("Guest detected\n");
+}
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index a4c061fb7c6e..29b36e9e4e74 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -63,7 +63,7 @@ For 32-bit we have the following conventions - kernel is built with
* for assembly code:
*/
-.macro PUSH_REGS rdx=%rdx rax=%rax save_ret=0
+.macro PUSH_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
.if \save_ret
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
@@ -73,7 +73,7 @@ For 32-bit we have the following conventions - kernel is built with
pushq %rsi /* pt_regs->si */
.endif
pushq \rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
+ pushq \rcx /* pt_regs->cx */
pushq \rax /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
@@ -99,6 +99,7 @@ For 32-bit we have the following conventions - kernel is built with
* well before they could be put to use in a speculative execution
* gadget.
*/
+ xorl %esi, %esi /* nospec si */
xorl %edx, %edx /* nospec dx */
xorl %ecx, %ecx /* nospec cx */
xorl %r8d, %r8d /* nospec r8 */
@@ -114,32 +115,24 @@ For 32-bit we have the following conventions - kernel is built with
.endm
-.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
- PUSH_REGS rdx=\rdx, rax=\rax, save_ret=\save_ret
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rcx=%rcx rax=%rax save_ret=0
+ PUSH_REGS rdx=\rdx, rcx=\rcx, rax=\rax, save_ret=\save_ret
CLEAR_REGS
.endm
-.macro POP_REGS pop_rdi=1 skip_r11rcx=0
+.macro POP_REGS pop_rdi=1
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
- .if \skip_r11rcx
- popq %rsi
- .else
popq %r11
- .endif
popq %r10
popq %r9
popq %r8
popq %rax
- .if \skip_r11rcx
- popq %rsi
- .else
popq %rcx
- .endif
popq %rdx
popq %rsi
.if \pop_rdi
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4faac48ebec5..4300ba49b5ee 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -191,8 +191,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
- /* rcx and r11 are already restored (see code above) */
- POP_REGS pop_rdi=0 skip_r11rcx=1
+ POP_REGS pop_rdi=0
/*
* Now all regs are restored except RSP and RDI.
@@ -215,8 +214,13 @@ syscall_return_via_sysret:
popq %rdi
popq %rsp
+SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
swapgs
sysretq
+SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ int3
SYM_CODE_END(entry_SYSCALL_64)
/*
@@ -318,6 +322,14 @@ SYM_CODE_END(ret_from_fork)
#endif
.endm
+/* Save all registers in pt_regs */
+SYM_CODE_START_LOCAL(push_and_clear_regs)
+ UNWIND_HINT_FUNC
+ PUSH_AND_CLEAR_REGS save_ret=1
+ ENCODE_FRAME_POINTER 8
+ RET
+SYM_CODE_END(push_and_clear_regs)
+
/**
* idtentry_body - Macro to emit code calling the C function
* @cfunc: C function to be called
@@ -325,7 +337,21 @@ SYM_CODE_END(ret_from_fork)
*/
.macro idtentry_body cfunc has_error_code:req
- call error_entry
+ call push_and_clear_regs
+ UNWIND_HINT_REGS
+
+ /*
+ * Call error_entry() and switch to the task stack if from userspace.
+ *
+ * When in XENPV, it is already in the task stack, and it can't fault
+ * for native_iret() nor native_load_gs_index() since XENPV uses its
+ * own pvops for IRET and load_gs_index(). And it doesn't need to
+ * switch the CR3. So it can skip invoking error_entry().
+ */
+ ALTERNATIVE "call error_entry; movq %rax, %rsp", \
+ "", X86_FEATURE_XENPV
+
+ ENCODE_FRAME_POINTER
UNWIND_HINT_REGS
movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
@@ -337,6 +363,9 @@ SYM_CODE_END(ret_from_fork)
call \cfunc
+ /* For some configurations \cfunc ends up being a noreturn. */
+ REACHABLE
+
jmp error_return
.endm
@@ -355,6 +384,7 @@ SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
ENDBR
ASM_CLAC
+ cld
.if \has_error_code == 0
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -423,6 +453,7 @@ SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS
ENDBR
ASM_CLAC
+ cld
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -479,6 +510,7 @@ SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS
ENDBR
ASM_CLAC
+ cld
/*
* If the entry is from userspace, switch stacks and treat it as
@@ -505,6 +537,7 @@ SYM_CODE_START(\asmsym)
call vc_switch_off_ist
movq %rax, %rsp /* Switch to new stack */
+ ENCODE_FRAME_POINTER
UNWIND_HINT_REGS
/* Update pt_regs */
@@ -541,6 +574,7 @@ SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS offset=8
ENDBR
ASM_CLAC
+ cld
/* paranoid_entry returns GS information for paranoid_exit in EBX. */
call paranoid_entry
@@ -866,7 +900,6 @@ SYM_CODE_END(xen_failsafe_callback)
*/
SYM_CODE_START_LOCAL(paranoid_entry)
UNWIND_HINT_FUNC
- cld
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
@@ -980,13 +1013,10 @@ SYM_CODE_START_LOCAL(paranoid_exit)
SYM_CODE_END(paranoid_exit)
/*
- * Save all registers in pt_regs, and switch GS if needed.
+ * Switch GS and CR3 if needed.
*/
SYM_CODE_START_LOCAL(error_entry)
UNWIND_HINT_FUNC
- cld
- PUSH_AND_CLEAR_REGS save_ret=1
- ENCODE_FRAME_POINTER 8
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -994,19 +1024,15 @@ SYM_CODE_START_LOCAL(error_entry)
* We entered from user mode or we're pretending to have entered
* from user mode due to an IRET fault.
*/
- SWAPGS
+ swapgs
FENCE_SWAPGS_USER_ENTRY
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
.Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
- popq %r12 /* save return addr in %12 */
- movq %rsp, %rdi /* arg0 = pt_regs pointer */
call sync_regs
- movq %rax, %rsp /* switch stack */
- ENCODE_FRAME_POINTER
- pushq %r12
RET
/*
@@ -1030,7 +1056,7 @@ SYM_CODE_START_LOCAL(error_entry)
* gsbase and proceed. We'll fix up the exception and land in
* .Lgs_change's error handler with kernel gsbase.
*/
- SWAPGS
+ swapgs
/*
* Issue an LFENCE to prevent GS speculation, regardless of whether it is a
@@ -1038,6 +1064,7 @@ SYM_CODE_START_LOCAL(error_entry)
*/
.Lerror_entry_done_lfence:
FENCE_SWAPGS_KERNEL_ENTRY
+ leaq 8(%rsp), %rax /* return pt_regs pointer */
RET
.Lbstep_iret:
@@ -1050,7 +1077,7 @@ SYM_CODE_START_LOCAL(error_entry)
* We came from an IRET to user mode, so we have user
* gsbase and CR3. Switch to kernel gsbase and CR3:
*/
- SWAPGS
+ swapgs
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
@@ -1058,9 +1085,9 @@ SYM_CODE_START_LOCAL(error_entry)
* Pretend that the exception came from user mode: set up pt_regs
* as if we faulted immediately after IRET.
*/
- mov %rsp, %rdi
+ leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
call fixup_bad_iret
- mov %rax, %rsp
+ mov %rax, %rdi
jmp .Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)
@@ -1123,6 +1150,7 @@ SYM_CODE_START(asm_exc_nmi)
*/
ASM_CLAC
+ cld
/* Use %rdx as our temp variable throughout */
pushq %rdx
@@ -1142,7 +1170,6 @@ SYM_CODE_START(asm_exc_nmi)
*/
swapgs
- cld
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 4fdb007cddbd..d1052742ad0c 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -50,7 +50,7 @@ SYM_CODE_START(entry_SYSENTER_compat)
UNWIND_HINT_EMPTY
ENDBR
/* Interrupts are off on entry. */
- SWAPGS
+ swapgs
pushq %rax
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
@@ -83,32 +83,7 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
movl %eax, %eax
pushq %rax /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
- pushq %rsi /* pt_regs->si */
- pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
- pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 = 0 */
- xorl %r8d, %r8d /* nospec r8 */
- pushq $0 /* pt_regs->r9 = 0 */
- xorl %r9d, %r9d /* nospec r9 */
- pushq $0 /* pt_regs->r10 = 0 */
- xorl %r10d, %r10d /* nospec r10 */
- pushq $0 /* pt_regs->r11 = 0 */
- xorl %r11d, %r11d /* nospec r11 */
- pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx */
- pushq %rbp /* pt_regs->rbp (will be overwritten) */
- xorl %ebp, %ebp /* nospec rbp */
- pushq $0 /* pt_regs->r12 = 0 */
- xorl %r12d, %r12d /* nospec r12 */
- pushq $0 /* pt_regs->r13 = 0 */
- xorl %r13d, %r13d /* nospec r13 */
- pushq $0 /* pt_regs->r14 = 0 */
- xorl %r14d, %r14d /* nospec r14 */
- pushq $0 /* pt_regs->r15 = 0 */
- xorl %r15d, %r15d /* nospec r15 */
-
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
UNWIND_HINT_REGS
cld
@@ -225,35 +200,7 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
movl %eax, %eax /* discard orig_ax high bits */
pushq %rax /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
- pushq %rsi /* pt_regs->si */
- xorl %esi, %esi /* nospec si */
- pushq %rdx /* pt_regs->dx */
- xorl %edx, %edx /* nospec dx */
- pushq %rbp /* pt_regs->cx (stashed in bp) */
- xorl %ecx, %ecx /* nospec cx */
- pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 = 0 */
- xorl %r8d, %r8d /* nospec r8 */
- pushq $0 /* pt_regs->r9 = 0 */
- xorl %r9d, %r9d /* nospec r9 */
- pushq $0 /* pt_regs->r10 = 0 */
- xorl %r10d, %r10d /* nospec r10 */
- pushq $0 /* pt_regs->r11 = 0 */
- xorl %r11d, %r11d /* nospec r11 */
- pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx */
- pushq %rbp /* pt_regs->rbp (will be overwritten) */
- xorl %ebp, %ebp /* nospec rbp */
- pushq $0 /* pt_regs->r12 = 0 */
- xorl %r12d, %r12d /* nospec r12 */
- pushq $0 /* pt_regs->r13 = 0 */
- xorl %r13d, %r13d /* nospec r13 */
- pushq $0 /* pt_regs->r14 = 0 */
- xorl %r14d, %r14d /* nospec r14 */
- pushq $0 /* pt_regs->r15 = 0 */
- xorl %r15d, %r15d /* nospec r15 */
-
+ PUSH_AND_CLEAR_REGS rcx=%rbp rax=$-ENOSYS
UNWIND_HINT_REGS
movq %rsp, %rdi
@@ -297,6 +244,8 @@ sysret32_from_system_call:
* code. We zero R8-R10 to avoid info leaks.
*/
movq RSP-ORIG_RAX(%rsp), %rsp
+SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
/*
* The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
@@ -314,6 +263,9 @@ sysret32_from_system_call:
xorl %r10d, %r10d
swapgs
sysretl
+SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL)
+ ANNOTATE_NOENDBR
+ int3
SYM_CODE_END(entry_SYSCALL_compat)
/*
@@ -362,54 +314,25 @@ SYM_CODE_START(entry_INT80_compat)
/* switch to thread stack expects orig_ax and rdi to be pushed */
pushq %rax /* pt_regs->orig_ax */
- pushq %rdi /* pt_regs->di */
/* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
/* In the Xen PV case we already run on the thread stack. */
ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
- movq %rsp, %rdi
+ movq %rsp, %rax
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- pushq 6*8(%rdi) /* regs->ss */
- pushq 5*8(%rdi) /* regs->rsp */
- pushq 4*8(%rdi) /* regs->eflags */
- pushq 3*8(%rdi) /* regs->cs */
- pushq 2*8(%rdi) /* regs->ip */
- pushq 1*8(%rdi) /* regs->orig_ax */
- pushq (%rdi) /* pt_regs->di */
+ pushq 5*8(%rax) /* regs->ss */
+ pushq 4*8(%rax) /* regs->rsp */
+ pushq 3*8(%rax) /* regs->eflags */
+ pushq 2*8(%rax) /* regs->cs */
+ pushq 1*8(%rax) /* regs->ip */
+ pushq 0*8(%rax) /* regs->orig_ax */
.Lint80_keep_stack:
- pushq %rsi /* pt_regs->si */
- xorl %esi, %esi /* nospec si */
- pushq %rdx /* pt_regs->dx */
- xorl %edx, %edx /* nospec dx */
- pushq %rcx /* pt_regs->cx */
- xorl %ecx, %ecx /* nospec cx */
- pushq $-ENOSYS /* pt_regs->ax */
- pushq %r8 /* pt_regs->r8 */
- xorl %r8d, %r8d /* nospec r8 */
- pushq %r9 /* pt_regs->r9 */
- xorl %r9d, %r9d /* nospec r9 */
- pushq %r10 /* pt_regs->r10*/
- xorl %r10d, %r10d /* nospec r10 */
- pushq %r11 /* pt_regs->r11 */
- xorl %r11d, %r11d /* nospec r11 */
- pushq %rbx /* pt_regs->rbx */
- xorl %ebx, %ebx /* nospec rbx */
- pushq %rbp /* pt_regs->rbp */
- xorl %ebp, %ebp /* nospec rbp */
- pushq %r12 /* pt_regs->r12 */
- xorl %r12d, %r12d /* nospec r12 */
- pushq %r13 /* pt_regs->r13 */
- xorl %r13d, %r13d /* nospec r13 */
- pushq %r14 /* pt_regs->r14 */
- xorl %r14d, %r14d /* nospec r14 */
- pushq %r15 /* pt_regs->r15 */
- xorl %r15d, %r15d /* nospec r15 */
-
+ PUSH_AND_CLEAR_REGS rax=$-ENOSYS
UNWIND_HINT_REGS
cld
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 693f8b9031fb..c2a8b76ae0bc 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -91,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
#
# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
@@ -148,6 +148,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 235a5794296a..1000d457c332 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -438,7 +438,7 @@ bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
static __init int vdso_setup(char *s)
{
vdso64_enabled = simple_strtoul(s, NULL, 0);
- return 0;
+ return 1;
}
__setup("vdso=", vdso_setup);
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index fd2ee9408e91..4af81df133ee 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -48,7 +48,7 @@ static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
XONLY;
#else
- EMULATE;
+ #error VSYSCALL config is broken
#endif
static int __init vsyscall_setup(char *str)
diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig
index d6cdfe631674..09c56965750a 100644
--- a/arch/x86/events/Kconfig
+++ b/arch/x86/events/Kconfig
@@ -44,4 +44,12 @@ config PERF_EVENTS_AMD_UNCORE
To compile this driver as a module, choose M here: the
module will be called 'amd-uncore'.
+
+config PERF_EVENTS_AMD_BRS
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ bool "AMD Zen3 Branch Sampling support"
+ help
+ Enable AMD Zen3 branch sampling support (BRS) which samples up to
+ 16 consecutive taken branches in registers.
+
endmenu
diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile
index 6cbe38d5fd9d..b9f5d4610256 100644
--- a/arch/x86/events/amd/Makefile
+++ b/arch/x86/events/amd/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_CPU_SUP_AMD) += core.o
+obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o
obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o
obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o
obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
new file mode 100644
index 000000000000..bee8765a1e9b
--- /dev/null
+++ b/arch/x86/events/amd/brs.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Implement support for AMD Fam19h Branch Sampling feature
+ * Based on specifications published in AMD PPR Fam19 Model 01
+ *
+ * Copyright 2021 Google LLC
+ * Contributed by Stephane Eranian <eranian@google.com>
+ */
+#include <linux/kernel.h>
+#include <linux/jump_label.h>
+#include <asm/msr.h>
+#include <asm/cpufeature.h>
+
+#include "../perf_event.h"
+
+#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */
+
+/* Debug Extension Configuration register layout */
+union amd_debug_extn_cfg {
+ __u64 val;
+ struct {
+ __u64 rsvd0:2, /* reserved */
+ brsmen:1, /* branch sample enable */
+ rsvd4_3:2,/* reserved - must be 0x3 */
+ vb:1, /* valid branches recorded */
+ rsvd2:10, /* reserved */
+ msroff:4, /* index of next entry to write */
+ rsvd3:4, /* reserved */
+ pmc:3, /* #PMC holding the sampling event */
+ rsvd4:37; /* reserved */
+ };
+};
+
+static inline unsigned int brs_from(int idx)
+{
+ return MSR_AMD_SAMP_BR_FROM + 2 * idx;
+}
+
+static inline unsigned int brs_to(int idx)
+{
+ return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1;
+}
+
+static inline void set_debug_extn_cfg(u64 val)
+{
+ /* bits[4:3] must always be set to 11b */
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3);
+}
+
+static inline u64 get_debug_extn_cfg(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, val);
+ return val;
+}
+
+static bool __init amd_brs_detect(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BRS))
+ return false;
+
+ switch (boot_cpu_data.x86) {
+ case 0x19: /* AMD Fam19h (Zen3) */
+ x86_pmu.lbr_nr = 16;
+
+ /* No hardware filtering supported */
+ x86_pmu.lbr_sel_map = NULL;
+ x86_pmu.lbr_sel_mask = 0;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Current BRS implementation does not support branch type or privilege level
+ * filtering. Therefore, this function simply enforces these limitations. No need for
+ * a br_sel_map. Software filtering is not supported because it would not correlate well
+ * with a sampling period.
+ */
+int amd_brs_setup_filter(struct perf_event *event)
+{
+ u64 type = event->attr.branch_sample_type;
+
+ /* No BRS support */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ /* Can only capture all branches, i.e., no filtering */
+ if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* tos = top of stack, i.e., last valid entry written */
+static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg)
+{
+ /*
+ * msroff: index of next entry to write so top-of-stack is one off
+ * if BRS is full then msroff is set back to 0.
+ */
+ return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1;
+}
+
+/*
+ * make sure we have a sane BRS offset to begin with
+ * especially with kexec
+ */
+void amd_brs_reset(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BRS))
+ return;
+
+ /*
+ * Reset config
+ */
+ set_debug_extn_cfg(0);
+
+ /*
+ * Mark first entry as poisoned
+ */
+ wrmsrl(brs_to(0), BRS_POISON);
+}
+
+int __init amd_brs_init(void)
+{
+ if (!amd_brs_detect())
+ return -EOPNOTSUPP;
+
+ pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr);
+
+ return 0;
+}
+
+void amd_brs_enable(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /* Activate only on first user */
+ if (++cpuc->brs_active > 1)
+ return;
+
+ cfg.val = 0; /* reset all fields */
+ cfg.brsmen = 1; /* enable branch sampling */
+
+ /* Set enable bit */
+ set_debug_extn_cfg(cfg.val);
+}
+
+void amd_brs_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ if (cpuc->lbr_users)
+ amd_brs_enable();
+}
+
+void amd_brs_disable(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /* Check if active (could be disabled via x86_pmu_disable_all()) */
+ if (!cpuc->brs_active)
+ return;
+
+ /* Only disable for last user */
+ if (--cpuc->brs_active)
+ return;
+
+ /*
+ * Clear the brsmen bit but preserve the others as they contain
+ * useful state such as vb and msroff
+ */
+ cfg.val = get_debug_extn_cfg();
+
+ /*
+ * When coming in on interrupt and BRS is full, then hw will have
+ * already stopped BRS, no need to issue wrmsr again
+ */
+ if (cfg.brsmen) {
+ cfg.brsmen = 0;
+ set_debug_extn_cfg(cfg.val);
+ }
+}
+
+void amd_brs_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ if (cpuc->lbr_users)
+ amd_brs_disable();
+}
+
+static bool amd_brs_match_plm(struct perf_event *event, u64 to)
+{
+ int type = event->attr.branch_sample_type;
+ int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV;
+ int plm_u = PERF_SAMPLE_BRANCH_USER;
+
+ if (!(type & plm_k) && kernel_ip(to))
+ return 0;
+
+ if (!(type & plm_u) && !kernel_ip(to))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Caller must ensure amd_brs_inuse() is true before calling
+ * return:
+ */
+void amd_brs_drain(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_event *event = cpuc->events[0];
+ struct perf_branch_entry *br = cpuc->lbr_entries;
+ union amd_debug_extn_cfg cfg;
+ u32 i, nr = 0, num, tos, start;
+ u32 shift = 64 - boot_cpu_data.x86_virt_bits;
+
+ /*
+ * BRS event forced on PMC0,
+ * so check if there is an event.
+ * It is possible to have lbr_users > 0 but the event
+ * not yet scheduled due to long latency PMU irq
+ */
+ if (!event)
+ goto empty;
+
+ cfg.val = get_debug_extn_cfg();
+
+ /* Sanity check [0-x86_pmu.lbr_nr] */
+ if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr))
+ goto empty;
+
+ /* No valid branch */
+ if (cfg.vb == 0)
+ goto empty;
+
+ /*
+ * msr.off points to next entry to be written
+ * tos = most recent entry index = msr.off - 1
+ * BRS register buffer saturates, so we know we have
+ * start < tos and that we have to read from start to tos
+ */
+ start = 0;
+ tos = amd_brs_get_tos(&cfg);
+
+ num = tos - start + 1;
+
+ /*
+ * BRS is only one pass (saturation) from MSROFF to depth-1
+ * MSROFF wraps to zero when buffer is full
+ */
+ for (i = 0; i < num; i++) {
+ u32 brs_idx = tos - i;
+ u64 from, to;
+
+ rdmsrl(brs_to(brs_idx), to);
+
+ /* Entry does not belong to us (as marked by kernel) */
+ if (to == BRS_POISON)
+ break;
+
+ /*
+ * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved.
+ * Necessary to generate proper virtual addresses suitable for
+ * symbolization
+ */
+ to = (u64)(((s64)to << shift) >> shift);
+
+ if (!amd_brs_match_plm(event, to))
+ continue;
+
+ rdmsrl(brs_from(brs_idx), from);
+
+ perf_clear_branch_entry_bitfields(br+nr);
+
+ br[nr].from = from;
+ br[nr].to = to;
+
+ nr++;
+ }
+empty:
+ /* Record number of sampled branches */
+ cpuc->lbr_stack.nr = nr;
+}
+
+/*
+ * Poison most recent entry to prevent reuse by next task
+ * required because BRS entry are not tagged by PID
+ */
+static void amd_brs_poison_buffer(void)
+{
+ union amd_debug_extn_cfg cfg;
+ unsigned int idx;
+
+ /* Get current state */
+ cfg.val = get_debug_extn_cfg();
+
+ /* idx is most recently written entry */
+ idx = amd_brs_get_tos(&cfg);
+
+ /* Poison target of entry */
+ wrmsrl(brs_to(idx), BRS_POISON);
+}
+
+/*
+ * On context switch in, we need to make sure no samples from previous user
+ * are left in the BRS.
+ *
+ * On ctxswin, sched_in = true, called after the PMU has started
+ * On ctxswout, sched_in = false, called before the PMU is stopped
+ */
+void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /* no active users */
+ if (!cpuc->lbr_users)
+ return;
+
+ /*
+ * On context switch in, we need to ensure we do not use entries
+ * from previous BRS user on that CPU, so we poison the buffer as
+ * a faster way compared to resetting all entries.
+ */
+ if (sched_in)
+ amd_brs_poison_buffer();
+}
+
+/*
+ * called from ACPI processor_idle.c or acpi_pad.c
+ * with interrupts disabled
+ */
+void perf_amd_brs_lopwr_cb(bool lopwr_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union amd_debug_extn_cfg cfg;
+
+ /*
+ * on mwait in, we may end up in non C0 state.
+ * we must disable branch sampling to avoid holding the NMI
+ * for too long. We disable it in hardware but we
+ * keep the state in cpuc, so we can re-enable.
+ *
+ * The hardware will deliver the NMI if needed when brsmen cleared
+ */
+ if (cpuc->brs_active) {
+ cfg.val = get_debug_extn_cfg();
+ cfg.brsmen = !lopwr_in;
+ set_debug_extn_cfg(cfg.val);
+ }
+}
+
+DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb);
+
+void __init amd_brs_lopwr_init(void)
+{
+ static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+}
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 9687a8aef01c..9ac3718410ce 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/perf_event.h>
+#include <linux/jump_label.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -7,6 +8,7 @@
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <asm/apicdef.h>
+#include <asm/apic.h>
#include <asm/nmi.h>
#include "../perf_event.h"
@@ -18,6 +20,9 @@ static unsigned long perf_nmi_window;
#define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL)
#define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE)
+/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */
+static u64 amd_pmu_global_cntr_mask __read_mostly;
+
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -325,8 +330,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc)
}
}
+#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
+static inline int amd_is_brs_event(struct perf_event *e)
+{
+ return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT;
+}
+
static int amd_core_hw_config(struct perf_event *event)
{
+ int ret = 0;
+
if (event->attr.exclude_host && event->attr.exclude_guest)
/*
* When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -343,7 +356,66 @@ static int amd_core_hw_config(struct perf_event *event)
if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw))
event->hw.flags |= PERF_X86_EVENT_PAIR;
- return 0;
+ /*
+ * if branch stack is requested
+ */
+ if (has_branch_stack(event)) {
+ /*
+ * Due to interrupt holding, BRS is not recommended in
+ * counting mode.
+ */
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ /*
+ * Due to the way BRS operates by holding the interrupt until
+ * lbr_nr entries have been captured, it does not make sense
+ * to allow sampling on BRS with an event that does not match
+ * what BRS is capturing, i.e., retired taken branches.
+ * Otherwise the correlation with the event's period is even
+ * more loose:
+ *
+ * With retired taken branch:
+ * Effective P = P + 16 + X
+ * With any other event:
+ * Effective P = P + Y + X
+ *
+ * Where X is the number of taken branches due to interrupt
+ * skid. Skid is large.
+ *
+ * Where Y is the occurences of the event while BRS is
+ * capturing the lbr_nr entries.
+ *
+ * By using retired taken branches, we limit the impact on the
+ * Y variable. We know it cannot be more than the depth of
+ * BRS.
+ */
+ if (!amd_is_brs_event(event))
+ return -EINVAL;
+
+ /*
+ * BRS implementation does not work with frequency mode
+ * reprogramming of the period.
+ */
+ if (event->attr.freq)
+ return -EINVAL;
+ /*
+ * The kernel subtracts BRS depth from period, so it must
+ * be big enough.
+ */
+ if (event->attr.sample_period <= x86_pmu.lbr_nr)
+ return -EINVAL;
+
+ /*
+ * Check if we can allow PERF_SAMPLE_BRANCH_STACK
+ */
+ ret = amd_brs_setup_filter(event);
+
+ /* only set in case of success */
+ if (!ret)
+ event->hw.flags |= PERF_X86_EVENT_AMD_BRS;
+ }
+ return ret;
}
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
@@ -366,7 +438,7 @@ static int amd_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip && get_ibs_caps())
return -ENOENT;
- if (has_branch_stack(event))
+ if (has_branch_stack(event) && !x86_pmu.lbr_nr)
return -EOPNOTSUPP;
ret = x86_pmu_hw_config(event);
@@ -510,6 +582,18 @@ static struct amd_nb *amd_alloc_nb(int cpu)
return nb;
}
+static void amd_pmu_cpu_reset(int cpu)
+{
+ if (x86_pmu.version < 2)
+ return;
+
+ /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0);
+
+ /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask);
+}
+
static int amd_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -555,6 +639,9 @@ static void amd_pmu_cpu_starting(int cpu)
cpuc->amd_nb->nb_id = nb_id;
cpuc->amd_nb->refcnt++;
+
+ amd_brs_reset();
+ amd_pmu_cpu_reset(cpu);
}
static void amd_pmu_cpu_dead(int cpu)
@@ -574,8 +661,54 @@ static void amd_pmu_cpu_dead(int cpu)
cpuhw->amd_nb = NULL;
}
+
+ amd_pmu_cpu_reset(cpu);
+}
+
+static inline void amd_pmu_set_global_ctl(u64 ctl)
+{
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
}
+static inline u64 amd_pmu_get_global_status(void)
+{
+ u64 status;
+
+ /* PerfCntrGlobalStatus is read-only */
+ rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status);
+
+ return status & amd_pmu_global_cntr_mask;
+}
+
+static inline void amd_pmu_ack_global_status(u64 status)
+{
+ /*
+ * PerfCntrGlobalStatus is read-only but an overflow acknowledgment
+ * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr
+ * clears the same bit in PerfCntrGlobalStatus
+ */
+
+ /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */
+ status &= amd_pmu_global_cntr_mask;
+ wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status);
+}
+
+static bool amd_pmu_test_overflow_topbit(int idx)
+{
+ u64 counter;
+
+ rdmsrl(x86_pmu_event_addr(idx), counter);
+
+ return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1));
+}
+
+static bool amd_pmu_test_overflow_status(int idx)
+{
+ return amd_pmu_get_global_status() & BIT_ULL(idx);
+}
+
+DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit);
+
/*
* When a PMC counter overflows, an NMI is used to process the event and
* reset the counter. NMI latency can result in the counter being updated
@@ -588,7 +721,6 @@ static void amd_pmu_cpu_dead(int cpu)
static void amd_pmu_wait_on_overflow(int idx)
{
unsigned int i;
- u64 counter;
/*
* Wait for the counter to be reset if it has overflowed. This loop
@@ -596,8 +728,7 @@ static void amd_pmu_wait_on_overflow(int idx)
* forever...
*/
for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
- rdmsrl(x86_pmu_event_addr(idx), counter);
- if (counter & (1ULL << (x86_pmu.cntval_bits - 1)))
+ if (!static_call(amd_pmu_test_overflow)(idx))
break;
/* Might be in IRQ context, so can't sleep */
@@ -605,13 +736,11 @@ static void amd_pmu_wait_on_overflow(int idx)
}
}
-static void amd_pmu_disable_all(void)
+static void amd_pmu_check_overflow(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
- x86_pmu_disable_all();
-
/*
* This shouldn't be called from NMI context, but add a safeguard here
* to return, since if we're in NMI context we can't wait for an NMI
@@ -634,6 +763,47 @@ static void amd_pmu_disable_all(void)
}
}
+static void amd_pmu_enable_event(struct perf_event *event)
+{
+ x86_pmu_enable_event(event);
+}
+
+static void amd_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int idx;
+
+ amd_brs_enable_all();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ /* only activate events which are marked as active */
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ amd_pmu_enable_event(cpuc->events[idx]);
+ }
+}
+
+static void amd_pmu_v2_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /*
+ * Testing cpu_hw_events.enabled should be skipped in this case unlike
+ * in x86_pmu_enable_event().
+ *
+ * Since cpu_hw_events.enabled is set only after returning from
+ * x86_pmu_start(), the PMCs must be programmed and kept ready.
+ * Counting starts only after x86_pmu_enable_all() is called.
+ */
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+static void amd_pmu_v2_enable_all(int added)
+{
+ amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask);
+}
+
static void amd_pmu_disable_event(struct perf_event *event)
{
x86_pmu_disable_event(event);
@@ -651,6 +821,32 @@ static void amd_pmu_disable_event(struct perf_event *event)
amd_pmu_wait_on_overflow(event->hw.idx);
}
+static void amd_pmu_disable_all(void)
+{
+ amd_brs_disable_all();
+ x86_pmu_disable_all();
+ amd_pmu_check_overflow();
+}
+
+static void amd_pmu_v2_disable_all(void)
+{
+ /* Disable all PMCs */
+ amd_pmu_set_global_ctl(0);
+ amd_pmu_check_overflow();
+}
+
+static void amd_pmu_add_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ amd_pmu_brs_add(event);
+}
+
+static void amd_pmu_del_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ amd_pmu_brs_del(event);
+}
+
/*
* Because of NMI latency, if multiple PMC counters are active or other sources
* of NMIs are received, the perf NMI handler can handle one or more overflowed
@@ -669,13 +865,8 @@ static void amd_pmu_disable_event(struct perf_event *event)
* handled a counter. When an un-handled NMI is received, it will be claimed
* only if arriving within that window.
*/
-static int amd_pmu_handle_irq(struct pt_regs *regs)
+static inline int amd_pmu_adjust_nmi_window(int handled)
{
- int handled;
-
- /* Process any counter overflows */
- handled = x86_pmu_handle_irq(regs);
-
/*
* If a counter was handled, record a timestamp such that un-handled
* NMIs will be claimed if arriving within that window.
@@ -692,6 +883,113 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
return NMI_HANDLED;
}
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int handled;
+ int pmu_enabled;
+
+ /*
+ * Save the PMU state.
+ * It needs to be restored when leaving the handler.
+ */
+ pmu_enabled = cpuc->enabled;
+ cpuc->enabled = 0;
+
+ /* stop everything (includes BRS) */
+ amd_pmu_disable_all();
+
+ /* Drain BRS is in use (could be inactive) */
+ if (cpuc->lbr_users)
+ amd_brs_drain();
+
+ /* Process any counter overflows */
+ handled = x86_pmu_handle_irq(regs);
+
+ cpuc->enabled = pmu_enabled;
+ if (pmu_enabled)
+ amd_pmu_enable_all(0);
+
+ return amd_pmu_adjust_nmi_window(handled);
+}
+
+static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_sample_data data;
+ struct hw_perf_event *hwc;
+ struct perf_event *event;
+ int handled = 0, idx;
+ u64 status, mask;
+ bool pmu_enabled;
+
+ /*
+ * Save the PMU state as it needs to be restored when leaving the
+ * handler
+ */
+ pmu_enabled = cpuc->enabled;
+ cpuc->enabled = 0;
+
+ /* Stop counting */
+ amd_pmu_v2_disable_all();
+
+ status = amd_pmu_get_global_status();
+
+ /* Check if any overflows are pending */
+ if (!status)
+ goto done;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ event = cpuc->events[idx];
+ hwc = &event->hw;
+ x86_perf_event_update(event);
+ mask = BIT_ULL(idx);
+
+ if (!(status & mask))
+ continue;
+
+ /* Event overflow */
+ handled++;
+ perf_sample_data_init(&data, 0, hwc->last_period);
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
+
+ status &= ~mask;
+ }
+
+ /*
+ * It should never be the case that some overflows are not handled as
+ * the corresponding PMCs are expected to be inactive according to the
+ * active_mask
+ */
+ WARN_ON(status > 0);
+
+ /* Clear overflow bits */
+ amd_pmu_ack_global_status(~status);
+
+ /*
+ * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT
+ * PMI entry is not set by the local APIC when a PMC overflow occurs
+ */
+ inc_irq_stat(apic_perf_irqs);
+
+done:
+ cpuc->enabled = pmu_enabled;
+
+ /* Resume counting only if PMU is active */
+ if (pmu_enabled)
+ amd_pmu_v2_enable_all(0);
+
+ return amd_pmu_adjust_nmi_window(handled);
+}
+
static struct event_constraint *
amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
@@ -897,6 +1195,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc,
--cpuc->n_pair;
}
+/*
+ * Because of the way BRS operates with an inactive and active phases, and
+ * the link to one counter, it is not possible to have two events using BRS
+ * scheduled at the same time. There would be an issue with enforcing the
+ * period of each one and given that the BRS saturates, it would not be possible
+ * to guarantee correlated content for all events. Therefore, in situations
+ * where multiple events want to use BRS, the kernel enforces mutual exclusion.
+ * Exclusion is enforced by chosing only one counter for events using BRS.
+ * The event scheduling logic will then automatically multiplex the
+ * events and ensure that at most one event is actively using BRS.
+ *
+ * The BRS counter could be any counter, but there is no constraint on Fam19h,
+ * therefore all counters are equal and thus we pick the first one: PMC0
+ */
+static struct event_constraint amd_fam19h_brs_cntr0_constraint =
+ EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK);
+
+static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint =
+ __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR);
+
+static struct event_constraint *
+amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ bool has_brs = has_amd_brs(hwc);
+
+ /*
+ * In case BRS is used with an event requiring a counter pair,
+ * the kernel allows it but only on counter 0 & 1 to enforce
+ * multiplexing requiring to protect BRS in case of multiple
+ * BRS users
+ */
+ if (amd_is_pair_event_code(hwc)) {
+ return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint
+ : &pair_constraint;
+ }
+
+ if (has_brs)
+ return &amd_fam19h_brs_cntr0_constraint;
+
+ return &unconstrained;
+}
+
+
static ssize_t amd_event_sysfs_show(char *page, u64 config)
{
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
@@ -905,12 +1248,31 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config)
return x86_event_sysfs_show(page, config, event);
}
+static void amd_pmu_sched_task(struct perf_event_context *ctx,
+ bool sched_in)
+{
+ if (sched_in && x86_pmu.lbr_nr)
+ amd_pmu_brs_sched_task(ctx, sched_in);
+}
+
+static u64 amd_pmu_limit_period(struct perf_event *event, u64 left)
+{
+ /*
+ * Decrease period by the depth of the BRS feature to get the last N
+ * taken branches and approximate the desired period
+ */
+ if (has_branch_stack(event) && left > x86_pmu.lbr_nr)
+ left -= x86_pmu.lbr_nr;
+
+ return left;
+}
+
static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = amd_pmu_handle_irq,
.disable_all = amd_pmu_disable_all,
- .enable_all = x86_pmu_enable_all,
- .enable = x86_pmu_enable_event,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_event,
.disable = amd_pmu_disable_event,
.hw_config = amd_pmu_hw_config,
.schedule_events = x86_schedule_events,
@@ -920,6 +1282,8 @@ static __initconst const struct x86_pmu amd_pmu = {
.event_map = amd_pmu_event_map,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
.num_counters = AMD64_NUM_COUNTERS,
+ .add = amd_pmu_add_event,
+ .del = amd_pmu_del_event,
.cntval_bits = 48,
.cntval_mask = (1ULL << 48) - 1,
.apic = 1,
@@ -938,8 +1302,55 @@ static __initconst const struct x86_pmu amd_pmu = {
.amd_nb_constraints = 1,
};
+static ssize_t branches_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr);
+}
+
+static DEVICE_ATTR_RO(branches);
+
+static struct attribute *amd_pmu_brs_attrs[] = {
+ &dev_attr_branches.attr,
+ NULL,
+};
+
+static umode_t
+amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return x86_pmu.lbr_nr ? attr->mode : 0;
+}
+
+static struct attribute_group group_caps_amd_brs = {
+ .name = "caps",
+ .attrs = amd_pmu_brs_attrs,
+ .is_visible = amd_brs_is_visible,
+};
+
+EVENT_ATTR_STR(branch-brs, amd_branch_brs,
+ "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n");
+
+static struct attribute *amd_brs_events_attrs[] = {
+ EVENT_PTR(amd_branch_brs),
+ NULL,
+};
+
+static struct attribute_group group_events_amd_brs = {
+ .name = "events",
+ .attrs = amd_brs_events_attrs,
+ .is_visible = amd_brs_is_visible,
+};
+
+static const struct attribute_group *amd_attr_update[] = {
+ &group_caps_amd_brs,
+ &group_events_amd_brs,
+ NULL,
+};
+
static int __init amd_core_pmu_init(void)
{
+ union cpuid_0x80000022_ebx ebx;
u64 even_ctr_mask = 0ULL;
int i;
@@ -957,6 +1368,27 @@ static int __init amd_core_pmu_init(void)
x86_pmu.eventsel = MSR_F15H_PERF_CTL;
x86_pmu.perfctr = MSR_F15H_PERF_CTR;
x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
+
+ /* Check for Performance Monitoring v2 support */
+ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) {
+ ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+
+ /* Update PMU version for later usage */
+ x86_pmu.version = 2;
+
+ /* Find the number of available Core PMCs */
+ x86_pmu.num_counters = ebx.split.num_core_pmc;
+
+ amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1;
+
+ /* Update PMC handling functions */
+ x86_pmu.enable_all = amd_pmu_v2_enable_all;
+ x86_pmu.disable_all = amd_pmu_v2_disable_all;
+ x86_pmu.enable = amd_pmu_v2_enable_event;
+ x86_pmu.handle_irq = amd_pmu_v2_handle_irq;
+ static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status);
+ }
+
/*
* AMD Core perfctr has separate MSRs for the NB events, see
* the amd/uncore.c driver.
@@ -989,6 +1421,23 @@ static int __init amd_core_pmu_init(void)
x86_pmu.flags |= PMU_FL_PAIR;
}
+ /*
+ * BRS requires special event constraints and flushing on ctxsw.
+ */
+ if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) {
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f19h;
+ x86_pmu.sched_task = amd_pmu_sched_task;
+ x86_pmu.limit_period = amd_pmu_limit_period;
+ /*
+ * put_event_constraints callback same as Fam17h, set above
+ */
+
+ /* branch sampling must be stopped when entering low power */
+ amd_brs_lopwr_init();
+ }
+
+ x86_pmu.attr_update = amd_attr_update;
+
pr_cont("core perfctr, ");
return 0;
}
@@ -1023,6 +1472,24 @@ __init int amd_pmu_init(void)
return 0;
}
+static inline void amd_pmu_reload_virt(void)
+{
+ if (x86_pmu.version >= 2) {
+ /*
+ * Clear global enable bits, reprogram the PERF_CTL
+ * registers with updated perf_ctr_virt_mask and then
+ * set global enable bits once again
+ */
+ amd_pmu_v2_disable_all();
+ amd_pmu_enable_all(0);
+ amd_pmu_v2_enable_all(0);
+ return;
+ }
+
+ amd_pmu_disable_all();
+ amd_pmu_enable_all(0);
+}
+
void amd_pmu_enable_virt(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1030,8 +1497,7 @@ void amd_pmu_enable_virt(void)
cpuc->perf_ctr_virt_mask = 0;
/* Reload all events */
- amd_pmu_disable_all();
- x86_pmu_enable_all(0);
+ amd_pmu_reload_virt();
}
EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
@@ -1048,7 +1514,6 @@ void amd_pmu_disable_virt(void)
cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
/* Reload all events */
- amd_pmu_disable_all();
- x86_pmu_enable_all(0);
+ amd_pmu_reload_virt();
}
EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 9739019d4b67..c251bc44c088 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -94,10 +94,6 @@ struct perf_ibs {
unsigned int fetch_ignore_if_zero_rip : 1;
struct cpu_perf_ibs __percpu *pcpu;
- struct attribute **format_attrs;
- struct attribute_group format_group;
- const struct attribute_group *attr_groups[2];
-
u64 (*get_count)(u64 config);
};
@@ -304,6 +300,16 @@ static int perf_ibs_init(struct perf_event *event)
hwc->config_base = perf_ibs->msr;
hwc->config = config;
+ /*
+ * rip recorded by IbsOpRip will not be consistent with rsp and rbp
+ * recorded as part of interrupt regs. Thus we need to use rip from
+ * interrupt regs while unwinding call stack. Setting _EARLY flag
+ * makes sure we unwind call-stack before perf sample rip is set to
+ * IbsOpRip.
+ */
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
+
return 0;
}
@@ -518,16 +524,118 @@ static void perf_ibs_del(struct perf_event *event, int flags)
static void perf_ibs_read(struct perf_event *event) { }
+/*
+ * We need to initialize with empty group if all attributes in the
+ * group are dynamic.
+ */
+static struct attribute *attrs_empty[] = {
+ NULL,
+};
+
+static struct attribute_group empty_format_group = {
+ .name = "format",
+ .attrs = attrs_empty,
+};
+
+static struct attribute_group empty_caps_group = {
+ .name = "caps",
+ .attrs = attrs_empty,
+};
+
+static const struct attribute_group *empty_attr_groups[] = {
+ &empty_format_group,
+ &empty_caps_group,
+ NULL,
+};
+
PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
+PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
+PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
+PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
-static struct attribute *ibs_fetch_format_attrs[] = {
+static umode_t
+zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
+}
+
+static struct attribute *rand_en_attrs[] = {
&format_attr_rand_en.attr,
NULL,
};
-static struct attribute *ibs_op_format_attrs[] = {
- NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+static struct attribute *fetch_l3missonly_attrs[] = {
+ &fetch_l3missonly.attr.attr,
+ NULL,
+};
+
+static struct attribute *zen4_ibs_extensions_attrs[] = {
+ &zen4_ibs_extensions.attr.attr,
+ NULL,
+};
+
+static struct attribute_group group_rand_en = {
+ .name = "format",
+ .attrs = rand_en_attrs,
+};
+
+static struct attribute_group group_fetch_l3missonly = {
+ .name = "format",
+ .attrs = fetch_l3missonly_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static struct attribute_group group_zen4_ibs_extensions = {
+ .name = "caps",
+ .attrs = zen4_ibs_extensions_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static const struct attribute_group *fetch_attr_groups[] = {
+ &group_rand_en,
+ &empty_caps_group,
+ NULL,
+};
+
+static const struct attribute_group *fetch_attr_update[] = {
+ &group_fetch_l3missonly,
+ &group_zen4_ibs_extensions,
+ NULL,
+};
+
+static umode_t
+cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
+}
+
+static struct attribute *cnt_ctl_attrs[] = {
+ &format_attr_cnt_ctl.attr,
+ NULL,
+};
+
+static struct attribute *op_l3missonly_attrs[] = {
+ &op_l3missonly.attr.attr,
+ NULL,
+};
+
+static struct attribute_group group_cnt_ctl = {
+ .name = "format",
+ .attrs = cnt_ctl_attrs,
+ .is_visible = cnt_ctl_is_visible,
+};
+
+static struct attribute_group group_op_l3missonly = {
+ .name = "format",
+ .attrs = op_l3missonly_attrs,
+ .is_visible = zen4_ibs_extensions_is_visible,
+};
+
+static const struct attribute_group *op_attr_update[] = {
+ &group_cnt_ctl,
+ &group_op_l3missonly,
+ &group_zen4_ibs_extensions,
NULL,
};
@@ -551,7 +659,6 @@ static struct perf_ibs perf_ibs_fetch = {
.max_period = IBS_FETCH_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
.offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
- .format_attrs = ibs_fetch_format_attrs,
.get_count = get_ibs_fetch_count,
};
@@ -577,7 +684,6 @@ static struct perf_ibs perf_ibs_op = {
.max_period = IBS_OP_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
.offset_max = MSR_AMD64_IBSOP_REG_COUNT,
- .format_attrs = ibs_op_format_attrs,
.get_count = get_ibs_op_count,
};
@@ -687,6 +793,14 @@ fail:
data.raw = &raw;
}
+ /*
+ * rip recorded by IbsOpRip will not be consistent with rsp and rbp
+ * recorded as part of interrupt regs. Thus we need to use rip from
+ * interrupt regs while unwinding call stack.
+ */
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ data.callchain = perf_callchain(event, iregs);
+
throttle = perf_event_overflow(event, &data, &regs);
out:
if (throttle) {
@@ -739,17 +853,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
perf_ibs->pcpu = pcpu;
- /* register attributes */
- if (perf_ibs->format_attrs[0]) {
- memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
- perf_ibs->format_group.name = "format";
- perf_ibs->format_group.attrs = perf_ibs->format_attrs;
-
- memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
- perf_ibs->attr_groups[0] = &perf_ibs->format_group;
- perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
- }
-
ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
if (ret) {
perf_ibs->pcpu = NULL;
@@ -759,10 +862,8 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
return ret;
}
-static __init void perf_event_ibs_init(void)
+static __init int perf_ibs_fetch_init(void)
{
- struct attribute **attr = ibs_op_format_attrs;
-
/*
* Some chips fail to reset the fetch count when it is written; instead
* they need a 0-1 transition of IbsFetchEn.
@@ -773,12 +874,19 @@ static __init void perf_event_ibs_init(void)
if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10)
perf_ibs_fetch.fetch_ignore_if_zero_rip = 1;
- perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY;
+
+ perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups;
+ perf_ibs_fetch.pmu.attr_update = fetch_attr_update;
- if (ibs_caps & IBS_CAPS_OPCNT) {
+ return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+}
+
+static __init int perf_ibs_op_init(void)
+{
+ if (ibs_caps & IBS_CAPS_OPCNT)
perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
- *attr++ = &format_attr_cnt_ctl.attr;
- }
if (ibs_caps & IBS_CAPS_OPCNTEXT) {
perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK;
@@ -786,15 +894,52 @@ static __init void perf_event_ibs_init(void)
perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK;
}
- perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;
+
+ perf_ibs_op.pmu.attr_groups = empty_attr_groups;
+ perf_ibs_op.pmu.attr_update = op_attr_update;
+
+ return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+}
+
+static __init int perf_event_ibs_init(void)
+{
+ int ret;
+
+ ret = perf_ibs_fetch_init();
+ if (ret)
+ return ret;
+
+ ret = perf_ibs_op_init();
+ if (ret)
+ goto err_op;
+
+ ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+ if (ret)
+ goto err_nmi;
- register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+ return 0;
+
+err_nmi:
+ perf_pmu_unregister(&perf_ibs_op.pmu);
+ free_percpu(perf_ibs_op.pcpu);
+ perf_ibs_op.pcpu = NULL;
+err_op:
+ perf_pmu_unregister(&perf_ibs_fetch.pmu);
+ free_percpu(perf_ibs_fetch.pcpu);
+ perf_ibs_fetch.pcpu = NULL;
+
+ return ret;
}
#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
-static __init void perf_event_ibs_init(void) { }
+static __init int perf_event_ibs_init(void)
+{
+ return 0;
+}
#endif
@@ -1064,9 +1209,7 @@ static __init int amd_ibs_init(void)
x86_pmu_amd_ibs_starting_cpu,
x86_pmu_amd_ibs_dying_cpu);
- perf_event_ibs_init();
-
- return 0;
+ return perf_event_ibs_init();
}
/* Since we need the pci subsystem to init ibs we can't do this earlier: */
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index eef816fc216d..30788894124f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1338,6 +1338,10 @@ static void x86_pmu_enable(struct pmu *pmu)
if (hwc->state & PERF_HES_ARCH)
continue;
+ /*
+ * if cpuc->enabled = 0, then no wrmsr as
+ * per x86_pmu_enable_event()
+ */
x86_pmu_start(event, PERF_EF_RELOAD);
}
cpuc->n_added = 0;
@@ -1704,11 +1708,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
* event overflow
*/
handled++;
- perf_sample_data_init(&data, 0, event->hw.last_period);
if (!x86_perf_event_set_period(event))
continue;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (has_branch_stack(event))
+ data.br_stack = &cpuc->lbr_stack;
+
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
@@ -1837,7 +1845,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha
/* string trumps id */
if (pmu_attr->event_str)
- return sprintf(page, "%s", pmu_attr->event_str);
+ return sprintf(page, "%s\n", pmu_attr->event_str);
return x86_pmu.events_sysfs_show(page, config);
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e88791b420ee..955ae91c56dc 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -302,7 +302,7 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
- INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE),
INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE),
INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
EVENT_EXTRA_END
@@ -5536,7 +5536,11 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con
/* Disabled fixed counters which are not in CPUID */
c->idxmsk64 &= intel_ctrl;
- if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+ /*
+ * Don't extend the pseudo-encoding to the
+ * generic counters
+ */
+ if (!use_fixed_pseudo_encoding(c->code))
c->idxmsk64 |= (1ULL << num_counters) - 1;
}
c->idxmsk64 &=
@@ -6212,6 +6216,9 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
+ case INTEL_FAM6_ALDERLAKE_N:
+ case INTEL_FAM6_RAPTORLAKE:
+ case INTEL_FAM6_RAPTORLAKE_P:
/*
* Alder Lake has 2 types of CPU, core and atom.
*
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index c6262b154c3a..8ec23f47fee9 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,7 +40,7 @@
* Model specific counters:
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
- * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL
+ * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -51,49 +51,50 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL
+ * TGL,TNT,RKL,ADL,RPL,SPR
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL,ADL
+ * ICL,TGL,RKL,ADL,RPL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL
+ * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
+ * RPL,SPR
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
* GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
- * ADL
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL
+ * TGL,TNT,RKL,ADL,RPL,SPR
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL,ADL
+ * KBL,CML,ICL,TGL,RKL,ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
* perf code: 0x04
* Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
- * ADL
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
* perf code: 0x05
* Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
- * ADL
+ * ADL,RPL
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL,ADL
+ * TNT,RKL,ADL,RPL
* Scope: Package (physical package)
*
*/
@@ -674,12 +675,16 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index fe1742c4ca49..13179f31fe10 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -769,6 +769,7 @@ void intel_pmu_lbr_disable_all(void)
void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
+ struct perf_branch_entry *br = cpuc->lbr_entries;
u64 tos = intel_pmu_lbr_tos();
int i;
@@ -784,15 +785,11 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
- cpuc->lbr_entries[i].from = msr_lastbranch.from;
- cpuc->lbr_entries[i].to = msr_lastbranch.to;
- cpuc->lbr_entries[i].mispred = 0;
- cpuc->lbr_entries[i].predicted = 0;
- cpuc->lbr_entries[i].in_tx = 0;
- cpuc->lbr_entries[i].abort = 0;
- cpuc->lbr_entries[i].cycles = 0;
- cpuc->lbr_entries[i].type = 0;
- cpuc->lbr_entries[i].reserved = 0;
+ perf_clear_branch_entry_bitfields(br);
+
+ br->from = msr_lastbranch.from;
+ br->to = msr_lastbranch.to;
+ br++;
}
cpuc->lbr_stack.nr = i;
cpuc->lbr_stack.hw_idx = tos;
@@ -807,6 +804,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
bool need_info = false, call_stack = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
+ struct perf_branch_entry *br = cpuc->lbr_entries;
u64 tos = intel_pmu_lbr_tos();
int i;
int out = 0;
@@ -878,15 +876,14 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
if (abort && x86_pmu.lbr_double_abort && out > 0)
out--;
- cpuc->lbr_entries[out].from = from;
- cpuc->lbr_entries[out].to = to;
- cpuc->lbr_entries[out].mispred = mis;
- cpuc->lbr_entries[out].predicted = pred;
- cpuc->lbr_entries[out].in_tx = in_tx;
- cpuc->lbr_entries[out].abort = abort;
- cpuc->lbr_entries[out].cycles = cycles;
- cpuc->lbr_entries[out].type = 0;
- cpuc->lbr_entries[out].reserved = 0;
+ perf_clear_branch_entry_bitfields(br+out);
+ br[out].from = from;
+ br[out].to = to;
+ br[out].mispred = mis;
+ br[out].predicted = pred;
+ br[out].in_tx = in_tx;
+ br[out].abort = abort;
+ br[out].cycles = cycles;
out++;
}
cpuc->lbr_stack.nr = out;
@@ -951,6 +948,8 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
to = rdlbr_to(i, lbr);
info = rdlbr_info(i, lbr);
+ perf_clear_branch_entry_bitfields(e);
+
e->from = from;
e->to = to;
e->mispred = get_lbr_mispred(info);
@@ -959,7 +958,6 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
e->abort = !!(info & LBR_INFO_ABORT);
e->cycles = get_lbr_cycles(info);
e->type = get_lbr_br_type(info);
- e->reserved = 0;
}
cpuc->lbr_stack.nr = i;
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index e497da9bf427..db6c31bca809 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1828,6 +1828,9 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
{},
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index f698a55bde81..ce440011cc4e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -79,6 +79,43 @@
#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650
#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668
#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670
+#define PCI_DEVICE_ID_INTEL_ADL_17_IMC 0x4614
+#define PCI_DEVICE_ID_INTEL_ADL_18_IMC 0x4617
+#define PCI_DEVICE_ID_INTEL_ADL_19_IMC 0x4618
+#define PCI_DEVICE_ID_INTEL_ADL_20_IMC 0x461B
+#define PCI_DEVICE_ID_INTEL_ADL_21_IMC 0x461C
+#define PCI_DEVICE_ID_INTEL_RPL_1_IMC 0xA700
+#define PCI_DEVICE_ID_INTEL_RPL_2_IMC 0xA702
+#define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706
+#define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709
+#define PCI_DEVICE_ID_INTEL_RPL_5_IMC 0xA701
+#define PCI_DEVICE_ID_INTEL_RPL_6_IMC 0xA703
+#define PCI_DEVICE_ID_INTEL_RPL_7_IMC 0xA704
+#define PCI_DEVICE_ID_INTEL_RPL_8_IMC 0xA705
+#define PCI_DEVICE_ID_INTEL_RPL_9_IMC 0xA706
+#define PCI_DEVICE_ID_INTEL_RPL_10_IMC 0xA707
+#define PCI_DEVICE_ID_INTEL_RPL_11_IMC 0xA708
+#define PCI_DEVICE_ID_INTEL_RPL_12_IMC 0xA709
+#define PCI_DEVICE_ID_INTEL_RPL_13_IMC 0xA70a
+#define PCI_DEVICE_ID_INTEL_RPL_14_IMC 0xA70b
+#define PCI_DEVICE_ID_INTEL_RPL_15_IMC 0xA715
+#define PCI_DEVICE_ID_INTEL_RPL_16_IMC 0xA716
+#define PCI_DEVICE_ID_INTEL_RPL_17_IMC 0xA717
+#define PCI_DEVICE_ID_INTEL_RPL_18_IMC 0xA718
+#define PCI_DEVICE_ID_INTEL_RPL_19_IMC 0xA719
+#define PCI_DEVICE_ID_INTEL_RPL_20_IMC 0xA71A
+#define PCI_DEVICE_ID_INTEL_RPL_21_IMC 0xA71B
+#define PCI_DEVICE_ID_INTEL_RPL_22_IMC 0xA71C
+#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728
+#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729
+#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A
+
+
+#define IMC_UNCORE_DEV(a) \
+{ \
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_##a##_IMC), \
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), \
+}
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -845,242 +882,80 @@ static struct intel_uncore_type *snb_pci_uncores[] = {
};
static const struct pci_device_id snb_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(SNB),
{ /* end: all zeroes */ },
};
static const struct pci_device_id ivb_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(IVB),
+ IMC_UNCORE_DEV(IVB_E3),
{ /* end: all zeroes */ },
};
static const struct pci_device_id hsw_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(HSW),
+ IMC_UNCORE_DEV(HSW_U),
{ /* end: all zeroes */ },
};
static const struct pci_device_id bdw_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(BDW),
{ /* end: all zeroes */ },
};
static const struct pci_device_id skl_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_Y_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_E3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_HQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_WQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(SKL_Y),
+ IMC_UNCORE_DEV(SKL_U),
+ IMC_UNCORE_DEV(SKL_HD),
+ IMC_UNCORE_DEV(SKL_HQ),
+ IMC_UNCORE_DEV(SKL_SD),
+ IMC_UNCORE_DEV(SKL_SQ),
+ IMC_UNCORE_DEV(SKL_E3),
+ IMC_UNCORE_DEV(KBL_Y),
+ IMC_UNCORE_DEV(KBL_U),
+ IMC_UNCORE_DEV(KBL_UQ),
+ IMC_UNCORE_DEV(KBL_SD),
+ IMC_UNCORE_DEV(KBL_SQ),
+ IMC_UNCORE_DEV(KBL_HQ),
+ IMC_UNCORE_DEV(KBL_WQ),
+ IMC_UNCORE_DEV(CFL_2U),
+ IMC_UNCORE_DEV(CFL_4U),
+ IMC_UNCORE_DEV(CFL_4H),
+ IMC_UNCORE_DEV(CFL_6H),
+ IMC_UNCORE_DEV(CFL_2S_D),
+ IMC_UNCORE_DEV(CFL_4S_D),
+ IMC_UNCORE_DEV(CFL_6S_D),
+ IMC_UNCORE_DEV(CFL_8S_D),
+ IMC_UNCORE_DEV(CFL_4S_W),
+ IMC_UNCORE_DEV(CFL_6S_W),
+ IMC_UNCORE_DEV(CFL_8S_W),
+ IMC_UNCORE_DEV(CFL_4S_S),
+ IMC_UNCORE_DEV(CFL_6S_S),
+ IMC_UNCORE_DEV(CFL_8S_S),
+ IMC_UNCORE_DEV(AML_YD),
+ IMC_UNCORE_DEV(AML_YQ),
+ IMC_UNCORE_DEV(WHL_UQ),
+ IMC_UNCORE_DEV(WHL_4_UQ),
+ IMC_UNCORE_DEV(WHL_UD),
+ IMC_UNCORE_DEV(CML_H1),
+ IMC_UNCORE_DEV(CML_H2),
+ IMC_UNCORE_DEV(CML_H3),
+ IMC_UNCORE_DEV(CML_U1),
+ IMC_UNCORE_DEV(CML_U2),
+ IMC_UNCORE_DEV(CML_U3),
+ IMC_UNCORE_DEV(CML_S1),
+ IMC_UNCORE_DEV(CML_S2),
+ IMC_UNCORE_DEV(CML_S3),
+ IMC_UNCORE_DEV(CML_S4),
+ IMC_UNCORE_DEV(CML_S5),
{ /* end: all zeroes */ },
};
static const struct pci_device_id icl_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_RKL_2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(ICL_U),
+ IMC_UNCORE_DEV(ICL_U2),
+ IMC_UNCORE_DEV(RKL_1),
+ IMC_UNCORE_DEV(RKL_2),
{ /* end: all zeroes */ },
};
@@ -1322,90 +1197,57 @@ void nhm_uncore_cpu_init(void)
/* Tiger Lake MMIO uncore support */
static const struct pci_device_id tgl_uncore_pci_ids[] = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_U4_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_H_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_1_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_3_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_4_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_5_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_6_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_7_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_8_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_9_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_10_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_11_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_12_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_13_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_14_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_15_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
+ IMC_UNCORE_DEV(TGL_U1),
+ IMC_UNCORE_DEV(TGL_U2),
+ IMC_UNCORE_DEV(TGL_U3),
+ IMC_UNCORE_DEV(TGL_U4),
+ IMC_UNCORE_DEV(TGL_H),
+ IMC_UNCORE_DEV(ADL_1),
+ IMC_UNCORE_DEV(ADL_2),
+ IMC_UNCORE_DEV(ADL_3),
+ IMC_UNCORE_DEV(ADL_4),
+ IMC_UNCORE_DEV(ADL_5),
+ IMC_UNCORE_DEV(ADL_6),
+ IMC_UNCORE_DEV(ADL_7),
+ IMC_UNCORE_DEV(ADL_8),
+ IMC_UNCORE_DEV(ADL_9),
+ IMC_UNCORE_DEV(ADL_10),
+ IMC_UNCORE_DEV(ADL_11),
+ IMC_UNCORE_DEV(ADL_12),
+ IMC_UNCORE_DEV(ADL_13),
+ IMC_UNCORE_DEV(ADL_14),
+ IMC_UNCORE_DEV(ADL_15),
+ IMC_UNCORE_DEV(ADL_16),
+ IMC_UNCORE_DEV(ADL_17),
+ IMC_UNCORE_DEV(ADL_18),
+ IMC_UNCORE_DEV(ADL_19),
+ IMC_UNCORE_DEV(ADL_20),
+ IMC_UNCORE_DEV(ADL_21),
+ IMC_UNCORE_DEV(RPL_1),
+ IMC_UNCORE_DEV(RPL_2),
+ IMC_UNCORE_DEV(RPL_3),
+ IMC_UNCORE_DEV(RPL_4),
+ IMC_UNCORE_DEV(RPL_5),
+ IMC_UNCORE_DEV(RPL_6),
+ IMC_UNCORE_DEV(RPL_7),
+ IMC_UNCORE_DEV(RPL_8),
+ IMC_UNCORE_DEV(RPL_9),
+ IMC_UNCORE_DEV(RPL_10),
+ IMC_UNCORE_DEV(RPL_11),
+ IMC_UNCORE_DEV(RPL_12),
+ IMC_UNCORE_DEV(RPL_13),
+ IMC_UNCORE_DEV(RPL_14),
+ IMC_UNCORE_DEV(RPL_15),
+ IMC_UNCORE_DEV(RPL_16),
+ IMC_UNCORE_DEV(RPL_17),
+ IMC_UNCORE_DEV(RPL_18),
+ IMC_UNCORE_DEV(RPL_19),
+ IMC_UNCORE_DEV(RPL_20),
+ IMC_UNCORE_DEV(RPL_21),
+ IMC_UNCORE_DEV(RPL_22),
+ IMC_UNCORE_DEV(RPL_23),
+ IMC_UNCORE_DEV(RPL_24),
+ IMC_UNCORE_DEV(RPL_25),
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 96c775abe31f..ac542f98c070 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -103,6 +103,9 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_ROCKETLAKE:
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
+ case INTEL_FAM6_ALDERLAKE_N:
+ case INTEL_FAM6_RAPTORLAKE:
+ case INTEL_FAM6_RAPTORLAKE_P:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 150261d929b9..21a5482bcf84 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -67,22 +67,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
/*
* struct hw_perf_event.flags flags
*/
-#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */
-#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */
-
-#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */
-#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */
-#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */
-#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */
-#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */
-#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */
-#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */
-#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */
+#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */
+#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */
+
+#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */
+#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */
+#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */
+#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */
+#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */
+#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */
+#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -325,6 +326,8 @@ struct cpu_hw_events {
* AMD specific bits
*/
struct amd_nb *amd_nb;
+ int brs_active; /* BRS is enabled */
+
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
int n_pair; /* Large increment events */
@@ -1105,6 +1108,11 @@ int x86_pmu_hw_config(struct perf_event *event);
void x86_pmu_disable_all(void);
+static inline bool has_amd_brs(struct hw_perf_event *hwc)
+{
+ return hwc->flags & PERF_X86_EVENT_AMD_BRS;
+}
+
static inline bool is_counter_pair(struct hw_perf_event *hwc)
{
return hwc->flags & PERF_X86_EVENT_PAIR;
@@ -1211,6 +1219,75 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu)
int amd_pmu_init(void);
+#ifdef CONFIG_PERF_EVENTS_AMD_BRS
+int amd_brs_init(void);
+void amd_brs_disable(void);
+void amd_brs_enable(void);
+void amd_brs_enable_all(void);
+void amd_brs_disable_all(void);
+void amd_brs_drain(void);
+void amd_brs_lopwr_init(void);
+void amd_brs_disable_all(void);
+int amd_brs_setup_filter(struct perf_event *event);
+void amd_brs_reset(void);
+
+static inline void amd_pmu_brs_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ perf_sched_cb_inc(event->ctx->pmu);
+ cpuc->lbr_users++;
+ /*
+ * No need to reset BRS because it is reset
+ * on brs_enable() and it is saturating
+ */
+}
+
+static inline void amd_pmu_brs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+
+ perf_sched_cb_dec(event->ctx->pmu);
+}
+
+void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in);
+#else
+static inline int amd_brs_init(void)
+{
+ return 0;
+}
+static inline void amd_brs_disable(void) {}
+static inline void amd_brs_enable(void) {}
+static inline void amd_brs_drain(void) {}
+static inline void amd_brs_lopwr_init(void) {}
+static inline void amd_brs_disable_all(void) {}
+static inline int amd_brs_setup_filter(struct perf_event *event)
+{
+ return 0;
+}
+static inline void amd_brs_reset(void) {}
+
+static inline void amd_pmu_brs_add(struct perf_event *event)
+{
+}
+
+static inline void amd_pmu_brs_del(struct perf_event *event)
+{
+}
+
+static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+}
+
+static inline void amd_brs_enable_all(void)
+{
+}
+
+#endif
+
#else /* CONFIG_CPU_SUP_AMD */
static inline int amd_pmu_init(void)
@@ -1218,6 +1295,22 @@ static inline int amd_pmu_init(void)
return 0;
}
+static inline int amd_brs_init(void)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void amd_brs_drain(void)
+{
+}
+
+static inline void amd_brs_enable_all(void)
+{
+}
+
+static inline void amd_brs_disable_all(void)
+{
+}
#endif /* CONFIG_CPU_SUP_AMD */
static inline int is_pebs_pt(struct perf_event *event)
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index 8e4d0391ff6c..e481056698de 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -5,7 +5,5 @@
obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o
-obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
-
audit-class-$(CONFIG_AUDIT) := audit.o
obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
deleted file mode 100644
index 9bd15241fadb..000000000000
--- a/arch/x86/ia32/ia32_aout.c
+++ /dev/null
@@ -1,325 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * a.out loader for x86-64
- *
- * Copyright (C) 1991, 1992, 1996 Linus Torvalds
- * Hacked together by Andi Kleen
- */
-
-#include <linux/module.h>
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/binfmts.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-#include <linux/jiffies.h>
-#include <linux/perf_event.h>
-#include <linux/sched/task_stack.h>
-
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <asm/user32.h>
-#include <asm/ia32.h>
-
-#undef WARN_OLD
-
-static int load_aout_binary(struct linux_binprm *);
-static int load_aout_library(struct file *);
-
-static struct linux_binfmt aout_format = {
- .module = THIS_MODULE,
- .load_binary = load_aout_binary,
- .load_shlib = load_aout_library,
-};
-
-static int set_brk(unsigned long start, unsigned long end)
-{
- start = PAGE_ALIGN(start);
- end = PAGE_ALIGN(end);
- if (end <= start)
- return 0;
- return vm_brk(start, end - start);
-}
-
-
-/*
- * create_aout_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
-{
- u32 __user *argv, *envp, *sp;
- int argc = bprm->argc, envc = bprm->envc;
-
- sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
- sp -= envc+1;
- envp = sp;
- sp -= argc+1;
- argv = sp;
- put_user((unsigned long) envp, --sp);
- put_user((unsigned long) argv, --sp);
- put_user(argc, --sp);
- current->mm->arg_start = (unsigned long) p;
- while (argc-- > 0) {
- char c;
-
- put_user((u32)(unsigned long)p, argv++);
- do {
- get_user(c, p++);
- } while (c);
- }
- put_user(0, argv);
- current->mm->arg_end = current->mm->env_start = (unsigned long) p;
- while (envc-- > 0) {
- char c;
-
- put_user((u32)(unsigned long)p, envp++);
- do {
- get_user(c, p++);
- } while (c);
- }
- put_user(0, envp);
- current->mm->env_end = (unsigned long) p;
- return sp;
-}
-
-/*
- * These are the functions used to load a.out style executables and shared
- * libraries. There is no binary dependent code anywhere else.
- */
-static int load_aout_binary(struct linux_binprm *bprm)
-{
- unsigned long error, fd_offset, rlim;
- struct pt_regs *regs = current_pt_regs();
- struct exec ex;
- int retval;
-
- ex = *((struct exec *) bprm->buf); /* exec-header */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
- N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
- N_TRSIZE(ex) || N_DRSIZE(ex) ||
- i_size_read(file_inode(bprm->file)) <
- ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- return -ENOEXEC;
- }
-
- fd_offset = N_TXTOFF(ex);
-
- /* Check initial limits. This avoids letting people circumvent
- * size limits imposed on them by creating programs with large
- * arrays in the data or bss.
- */
- rlim = rlimit(RLIMIT_DATA);
- if (rlim >= RLIM_INFINITY)
- rlim = ~0;
- if (ex.a_data + ex.a_bss > rlim)
- return -ENOMEM;
-
- /* Flush all traces of the currently running executable */
- retval = begin_new_exec(bprm);
- if (retval)
- return retval;
-
- /* OK, This is the point of no return */
- set_personality(PER_LINUX);
- set_personality_ia32(false);
-
- setup_new_exec(bprm);
-
- regs->cs = __USER32_CS;
- regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
- regs->r13 = regs->r14 = regs->r15 = 0;
-
- current->mm->end_code = ex.a_text +
- (current->mm->start_code = N_TXTADDR(ex));
- current->mm->end_data = ex.a_data +
- (current->mm->start_data = N_DATADDR(ex));
- current->mm->brk = ex.a_bss +
- (current->mm->start_brk = N_BSSADDR(ex));
-
- retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0)
- return retval;
-
- if (N_MAGIC(ex) == OMAGIC) {
- unsigned long text_addr, map_size;
-
- text_addr = N_TXTADDR(ex);
- map_size = ex.a_text+ex.a_data;
-
- error = vm_brk(text_addr & PAGE_MASK, map_size);
-
- if (error)
- return error;
-
- error = read_code(bprm->file, text_addr, 32,
- ex.a_text + ex.a_data);
- if ((signed long)error < 0)
- return error;
- } else {
-#ifdef WARN_OLD
- static unsigned long error_time, error_time2;
- if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
- (N_MAGIC(ex) != NMAGIC) &&
- time_after(jiffies, error_time2 + 5*HZ)) {
- printk(KERN_NOTICE "executable not page aligned\n");
- error_time2 = jiffies;
- }
-
- if ((fd_offset & ~PAGE_MASK) != 0 &&
- time_after(jiffies, error_time + 5*HZ)) {
- printk(KERN_WARNING
- "fd_offset is not page aligned. Please convert "
- "program: %pD\n",
- bprm->file);
- error_time = jiffies;
- }
-#endif
-
- if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
- error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
- if (error)
- return error;
-
- read_code(bprm->file, N_TXTADDR(ex), fd_offset,
- ex.a_text+ex.a_data);
- goto beyond_if;
- }
-
- error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
- PROT_READ | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
- fd_offset);
-
- if (error != N_TXTADDR(ex))
- return error;
-
- error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
- fd_offset + ex.a_text);
- if (error != N_DATADDR(ex))
- return error;
- }
-
-beyond_if:
- error = set_brk(current->mm->start_brk, current->mm->brk);
- if (error)
- return error;
-
- set_binfmt(&aout_format);
-
- current->mm->start_stack =
- (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
- /* start thread */
- loadsegment(fs, 0);
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
- load_gs_index(0);
- (regs)->ip = ex.a_entry;
- (regs)->sp = current->mm->start_stack;
- (regs)->flags = 0x200;
- (regs)->cs = __USER32_CS;
- (regs)->ss = __USER32_DS;
- regs->r8 = regs->r9 = regs->r10 = regs->r11 =
- regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
- return 0;
-}
-
-static int load_aout_library(struct file *file)
-{
- unsigned long bss, start_addr, len, error;
- int retval;
- struct exec ex;
- loff_t pos = 0;
-
- retval = -ENOEXEC;
- error = kernel_read(file, &ex, sizeof(ex), &pos);
- if (error != sizeof(ex))
- goto out;
-
- /* We come in here for the regular a.out style of shared libraries */
- if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
- N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
- i_size_read(file_inode(file)) <
- ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
- goto out;
- }
-
- if (N_FLAGS(ex))
- goto out;
-
- /* For QMAGIC, the starting address is 0x20 into the page. We mask
- this off to get the starting address for the page */
-
- start_addr = ex.a_entry & 0xfffff000;
-
- if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
-#ifdef WARN_OLD
- static unsigned long error_time;
- if (time_after(jiffies, error_time + 5*HZ)) {
- printk(KERN_WARNING
- "N_TXTOFF is not page aligned. Please convert "
- "library: %pD\n",
- file);
- error_time = jiffies;
- }
-#endif
- retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
- if (retval)
- goto out;
-
- read_code(file, start_addr, N_TXTOFF(ex),
- ex.a_text + ex.a_data);
- retval = 0;
- goto out;
- }
- /* Now use mmap to map the library into memory. */
- error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
- N_TXTOFF(ex));
- retval = error;
- if (error != start_addr)
- goto out;
-
- len = PAGE_ALIGN(ex.a_text + ex.a_data);
- bss = ex.a_text + ex.a_data + ex.a_bss;
- if (bss > len) {
- retval = vm_brk(start_addr + len, bss - len);
- if (retval)
- goto out;
- }
- retval = 0;
-out:
- return retval;
-}
-
-static int __init init_aout_binfmt(void)
-{
- register_binfmt(&aout_format);
- return 0;
-}
-
-static void __exit exit_aout_binfmt(void)
-{
- unregister_binfmt(&aout_format);
-}
-
-module_init(init_aout_binfmt);
-module_exit(exit_aout_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h
index 9aff97f0de7f..d937c55e717e 100644
--- a/arch/x86/include/asm/acenv.h
+++ b/arch/x86/include/asm/acenv.h
@@ -13,7 +13,19 @@
/* Asm macros */
-#define ACPI_FLUSH_CPU_CACHE() wbinvd()
+/*
+ * ACPI_FLUSH_CPU_CACHE() flushes caches on entering sleep states.
+ * It is required to prevent data loss.
+ *
+ * While running inside virtual machine, the kernel can bypass cache flushing.
+ * Changing sleep state in a virtual machine doesn't affect the host system
+ * sleep state and cannot lead to data loss.
+ */
+#define ACPI_FLUSH_CPU_CACHE() \
+do { \
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) \
+ wbinvd(); \
+} while (0)
int __acpi_acquire_global_lock(unsigned int *lock);
int __acpi_release_global_lock(unsigned int *lock);
diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h
index 46e1df45efc0..aabdbb5ab920 100644
--- a/arch/x86/include/asm/amd-ibs.h
+++ b/arch/x86/include/asm/amd-ibs.h
@@ -49,7 +49,7 @@ union ibs_op_ctl {
};
};
-/* MSR 0xc0011035: IBS Op Data 2 */
+/* MSR 0xc0011035: IBS Op Data 1 */
union ibs_op_data {
__u64 val;
struct {
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 00d1a400b7a1..ed0eaf65c437 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -16,7 +16,6 @@ extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
extern bool early_is_amd_nb(u32 value);
extern struct resource *amd_get_mmconfig_range(struct resource *res);
-extern int amd_cache_northbridges(void);
extern void amd_flush_garts(void);
extern int amd_numa_init(void);
extern int amd_get_subcaches(int);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 48067af94678..bd8ae0a7010a 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -328,6 +328,8 @@ struct apic {
/* wakeup_secondary_cpu */
int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
+ /* wakeup secondary CPU using 64-bit wakeup point */
+ int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
void (*inquire_remote_apic)(int apicid);
@@ -488,6 +490,11 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
}
+#ifdef CONFIG_X86_64
+typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip);
+extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler);
+#endif
+
extern int default_apic_id_valid(u32 apicid);
extern int default_acpi_madt_oem_check(char *, char *);
extern void default_setup_apic_routing(void);
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 5716f22f81ac..92035eb3afee 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -95,12 +95,6 @@
#define APIC_LVTTHMR 0x330
#define APIC_LVTPC 0x340
#define APIC_LVT0 0x350
-#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18)
-#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3)
-#define SET_APIC_TIMER_BASE(x) (((x) << 18))
-#define APIC_TIMER_BASE_CLKIN 0x0
-#define APIC_TIMER_BASE_TMBASE 0x1
-#define APIC_TIMER_BASE_DIV 0x2
#define APIC_LVT_TIMER_ONESHOT (0 << 17)
#define APIC_LVT_TIMER_PERIODIC (1 << 17)
#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17)
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index c878fed3056f..fbcfec4dc4cc 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -154,24 +154,24 @@
# define DEFINE_EXTABLE_TYPE_REG \
".macro extable_type_reg type:req reg:req\n" \
- ".set found, 0\n" \
- ".set regnr, 0\n" \
+ ".set .Lfound, 0\n" \
+ ".set .Lregnr, 0\n" \
".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n" \
".ifc \\reg, %%\\rs\n" \
- ".set found, found+1\n" \
- ".long \\type + (regnr << 8)\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".long \\type + (.Lregnr << 8)\n" \
".endif\n" \
- ".set regnr, regnr+1\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
".endr\n" \
- ".set regnr, 0\n" \
+ ".set .Lregnr, 0\n" \
".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n" \
".ifc \\reg, %%\\rs\n" \
- ".set found, found+1\n" \
- ".long \\type + (regnr << 8)\n" \
+ ".set .Lfound, .Lfound+1\n" \
+ ".long \\type + (.Lregnr << 8)\n" \
".endif\n" \
- ".set regnr, regnr+1\n" \
+ ".set .Lregnr, .Lregnr+1\n" \
".endr\n" \
- ".if (found != 1)\n" \
+ ".if (.Lfound != 1)\n" \
".error \"extable_type_reg: bad register argument\"\n" \
".endif\n" \
".endm\n"
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index 981fe923a59f..53e9b0620d96 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -74,6 +74,7 @@ static void sanitize_boot_params(struct boot_params *boot_params)
BOOT_PARAM_PRESERVE(hdr),
BOOT_PARAM_PRESERVE(e820_table),
BOOT_PARAM_PRESERVE(eddbuf),
+ BOOT_PARAM_PRESERVE(cc_blob_address),
};
memset(&scratch, 0, sizeof(scratch));
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 4d20a293c6fd..a3ec87d198ac 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -18,7 +18,7 @@
#ifdef CONFIG_X86_32
# define __BUG_REL(val) ".long " __stringify(val)
#else
-# define __BUG_REL(val) ".long " __stringify(val) " - 2b"
+# define __BUG_REL(val) ".long " __stringify(val) " - ."
#endif
#ifdef CONFIG_DEBUG_BUGVERBOSE
@@ -78,9 +78,9 @@ do { \
*/
#define __WARN_FLAGS(flags) \
do { \
- __auto_type f = BUGFLAG_WARNING|(flags); \
+ __auto_type __flags = BUGFLAG_WARNING|(flags); \
instrumentation_begin(); \
- _BUG_FLAGS(ASM_UD2, f, ASM_REACHABLE); \
+ _BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \
instrumentation_end(); \
} while (0)
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 0a7fe0321613..215f5a65790f 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
#define arch_cmpxchg64_local(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
(unsigned long long)(n)))
+#define arch_try_cmpxchg64(ptr, po, n) \
+ __try_cmpxchg64((ptr), (unsigned long long *)(po), \
+ (unsigned long long)(n))
#endif
static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
@@ -70,6 +73,24 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
return prev;
}
+static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
+{
+ bool success;
+ u64 old = *pold;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]"
+ CC_SET(z)
+ : CC_OUT(z) (success),
+ [ptr] "+m" (*ptr),
+ "+A" (old)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32))
+ : "memory");
+
+ if (unlikely(!success))
+ *pold = old;
+ return success;
+}
+
#ifndef CONFIG_X86_CMPXCHG64
/*
* Building a kernel capable running on 80386 and 80486. It may be necessary
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 072e5459fe2f..250187ac8248 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -19,6 +19,12 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
arch_cmpxchg_local((ptr), (o), (n)); \
})
+#define arch_try_cmpxchg64(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg((ptr), (po), (n)); \
+})
+
#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 7516e4199b3c..20fd0acd7d80 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -28,15 +28,13 @@ typedef u16 compat_ipc_pid_t;
typedef __kernel_fsid_t compat_fsid_t;
struct compat_stat {
- compat_dev_t st_dev;
- u16 __pad1;
+ u32 st_dev;
compat_ino_t st_ino;
compat_mode_t st_mode;
compat_nlink_t st_nlink;
__compat_uid_t st_uid;
__compat_gid_t st_gid;
- compat_dev_t st_rdev;
- u16 __pad2;
+ u32 st_rdev;
u32 st_size;
u32 st_blksize;
u32 st_blocks;
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 86e5e4e26fcb..8cbf623f0ecf 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -36,6 +36,8 @@ extern int _debug_hotplug_cpu(int cpu, int action);
#endif
#endif
+extern void ap_init_aperfmperf(void);
+
int mwait_usable(const struct cpuinfo_x86 *);
unsigned int x86_family(unsigned int sig);
@@ -43,14 +45,12 @@ unsigned int x86_model(unsigned int sig);
unsigned int x86_stepping(unsigned int sig);
#ifdef CONFIG_CPU_SUP_INTEL
extern void __init sld_setup(struct cpuinfo_x86 *c);
-extern void switch_to_sld(unsigned long tifn);
extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
extern bool handle_guest_split_lock(unsigned long ip);
extern void handle_bus_lock(struct pt_regs *regs);
u8 get_this_hybrid_cpu_type(void);
#else
static inline void __init sld_setup(struct cpuinfo_x86 *c) {}
-static inline void switch_to_sld(unsigned long tifn) {}
static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code)
{
return false;
@@ -76,4 +76,22 @@ static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {}
extern __noendbr void cet_disable(void);
+struct ucode_cpu_info;
+
+int intel_cpu_collect_info(struct ucode_cpu_info *uci);
+
+static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1,
+ unsigned int s2, unsigned int p2)
+{
+ if (s1 != s2)
+ return false;
+
+ /* Processor flags are either both 0 ... */
+ if (!p1 && !p2)
+ return true;
+
+ /* ... or they intersect. */
+ return p1 & p2;
+}
+
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index dd5ea1bdf04c..75efc4c6f076 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -143,7 +143,7 @@ extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
-static inline struct entry_stack *cpu_entry_stack(int cpu)
+static __always_inline struct entry_stack *cpu_entry_stack(int cpu)
{
return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
}
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 1261842d006c..66d3e3b1d24d 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -34,14 +34,17 @@ enum cpuid_leafs
CPUID_8000_001F_EAX,
};
+#define X86_CAP_FMT_NUM "%d:%d"
+#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31)
+
#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
#define X86_CAP_FMT "%s"
#define x86_cap_flag(flag) x86_cap_flags[flag]
#else
-#define X86_CAP_FMT "%d:%d"
-#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31)
+#define X86_CAP_FMT X86_CAP_FMT_NUM
+#define x86_cap_flag x86_cap_flag_num
#endif
/*
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 73e643ae94b6..21bb78dfd41d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -201,7 +201,7 @@
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-/* FREE! ( 7*32+10) */
+#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */
@@ -211,7 +211,7 @@
#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-/* FREE! ( 7*32+20) */
+#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
@@ -238,6 +238,7 @@
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */
#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */
+#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -315,6 +316,7 @@
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
+#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
new file mode 100644
index 000000000000..70b2db18165e
--- /dev/null
+++ b/arch/x86/include/asm/cpuid.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * CPUID-related helpers/definitions
+ *
+ * Derived from arch/x86/kvm/cpuid.c
+ */
+
+#ifndef _ASM_X86_CPUID_H
+#define _ASM_X86_CPUID_H
+
+static __always_inline bool cpuid_function_is_indexed(u32 function)
+{
+ switch (function) {
+ case 4:
+ case 7:
+ case 0xb:
+ case 0xd:
+ case 0xf:
+ case 0x10:
+ case 0x12:
+ case 0x14:
+ case 0x17:
+ case 0x18:
+ case 0x1d:
+ case 0x1e:
+ case 0x1f:
+ case 0x8000001d:
+ return true;
+ }
+
+ return false;
+}
+
+#endif /* _ASM_X86_CPUID_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 1231d63f836d..36369e76cc63 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -10,12 +10,6 @@
* cpu_feature_enabled().
*/
-#ifdef CONFIG_X86_SMAP
-# define DISABLE_SMAP 0
-#else
-# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31))
-#endif
-
#ifdef CONFIG_X86_UMIP
# define DISABLE_UMIP 0
#else
@@ -68,6 +62,12 @@
# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31))
#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+# define DISABLE_TDX_GUEST 0
+#else
+# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -79,8 +79,8 @@
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
#define DISABLED_MASK7 (DISABLE_PTI)
-#define DISABLED_MASK8 0
-#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX)
+#define DISABLED_MASK8 (DISABLE_TDX_GUEST)
+#define DISABLED_MASK9 (DISABLE_SGX)
#define DISABLED_MASK10 0
#define DISABLED_MASK11 0
#define DISABLED_MASK12 0
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 98938a68251c..bed74a0f2932 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -357,6 +357,11 @@ static inline u32 efi64_convert_status(efi_status_t status)
runtime), \
func, __VA_ARGS__))
+#define efi_dxe_call(func, ...) \
+ (efi_is_native() \
+ ? efi_dxe_table->func(__VA_ARGS__) \
+ : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__))
+
#else /* CONFIG_EFI_MIXED */
static inline bool efi_is_64bit(void)
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 29fea180a665..cb0ff1055ab1 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -116,7 +116,7 @@ extern unsigned int vdso32_enabled;
* now struct_user_regs, they are different)
*/
-#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
pr_reg[0] = regs->bx; \
pr_reg[1] = regs->cx; \
@@ -128,6 +128,7 @@ do { \
pr_reg[7] = regs->ds; \
pr_reg[8] = regs->es; \
pr_reg[9] = regs->fs; \
+ savesegment(gs, pr_reg[10]); \
pr_reg[11] = regs->orig_ax; \
pr_reg[12] = regs->ip; \
pr_reg[13] = regs->cs; \
@@ -136,18 +137,6 @@ do { \
pr_reg[16] = regs->ss; \
} while (0);
-#define ELF_CORE_COPY_REGS(pr_reg, regs) \
-do { \
- ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
- pr_reg[10] = get_user_gs(regs); \
-} while (0);
-
-#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \
-do { \
- ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
- savesegment(gs, pr_reg[10]); \
-} while (0);
-
#define ELF_PLATFORM (utsname()->machine)
#define set_personality_64bit() do { } while (0)
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 43184640b579..674ed46d3ced 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -10,7 +10,7 @@
#include <asm/fpu/api.h>
/* Check that the stack and regs on entry from user mode are sane. */
-static __always_inline void arch_check_user_regs(struct pt_regs *regs)
+static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) {
/*
@@ -42,7 +42,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs)
WARN_ON_ONCE(regs != task_pt_regs(current));
}
}
-#define arch_check_user_regs arch_check_user_regs
+#define arch_enter_from_user_mode arch_enter_from_user_mode
static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
unsigned long ti_work)
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index c83b3020350a..6b0f31fb53f7 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -162,7 +162,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
}
/* prctl */
-struct task_struct;
-extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2);
+extern long fpu_xstate_prctl(int option, unsigned long arg2);
#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/arch/x86/include/asm/fpu/internal.h
+++ /dev/null
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 032e020853aa..731ee7cc40a5 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -26,6 +26,7 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/fixmap.h>
+#include <asm/pgtable_areas.h>
/* declarations for highmem.c */
extern unsigned long highstart_pfn, highend_pfn;
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 7924f27f5c8b..72184b0b2219 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -632,6 +632,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap);
#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception);
+#endif
+
/* Device interrupts common/spurious */
DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt);
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 048b6d5aff50..def6ca121111 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -26,6 +26,7 @@
* _G - parts with extra graphics on
* _X - regular server parts
* _D - micro server parts
+ * _N,_P - other mobile parts
*
* Historical OPTDIFFs:
*
@@ -107,8 +108,10 @@
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
+#define INTEL_FAM6_ALDERLAKE_N 0xBE
#define INTEL_FAM6_RAPTORLAKE 0xB7
+#define INTEL_FAM6_RAPTORLAKE_P 0xBA
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f6d91ecb8026..1870b99c3356 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -44,6 +44,7 @@
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
+#include <asm/shared/io.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -210,8 +211,6 @@ void __iomem *ioremap(resource_size_t offset, unsigned long size);
extern void iounmap(volatile void __iomem *addr);
#define iounmap iounmap
-extern void set_iounmap_nonlazy(void);
-
#ifdef __KERNEL__
void memcpy_fromio(void *, const volatile void __iomem *, size_t);
@@ -258,37 +257,23 @@ static inline void slow_down_io(void)
#endif
#define BUILDIO(bwl, bw, type) \
-static inline void out##bwl(unsigned type value, int port) \
-{ \
- asm volatile("out" #bwl " %" #bw "0, %w1" \
- : : "a"(value), "Nd"(port)); \
-} \
- \
-static inline unsigned type in##bwl(int port) \
-{ \
- unsigned type value; \
- asm volatile("in" #bwl " %w1, %" #bw "0" \
- : "=a"(value) : "Nd"(port)); \
- return value; \
-} \
- \
-static inline void out##bwl##_p(unsigned type value, int port) \
+static inline void out##bwl##_p(type value, u16 port) \
{ \
out##bwl(value, port); \
slow_down_io(); \
} \
\
-static inline unsigned type in##bwl##_p(int port) \
+static inline type in##bwl##_p(u16 port) \
{ \
- unsigned type value = in##bwl(port); \
+ type value = in##bwl(port); \
slow_down_io(); \
return value; \
} \
\
-static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \
{ \
if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
- unsigned type *value = (unsigned type *)addr; \
+ type *value = (type *)addr; \
while (count) { \
out##bwl(*value, port); \
value++; \
@@ -301,10 +286,10 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
} \
} \
\
-static inline void ins##bwl(int port, void *addr, unsigned long count) \
+static inline void ins##bwl(u16 port, void *addr, unsigned long count) \
{ \
if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
- unsigned type *value = (unsigned type *)addr; \
+ type *value = (type *)addr; \
while (count) { \
*value = in##bwl(port); \
value++; \
@@ -317,13 +302,11 @@ static inline void ins##bwl(int port, void *addr, unsigned long count) \
} \
}
-BUILDIO(b, b, char)
-BUILDIO(w, w, short)
-BUILDIO(l, , int)
+BUILDIO(b, b, u8)
+BUILDIO(w, w, u16)
+BUILDIO(l, , u32)
+#undef BUILDIO
-#define inb inb
-#define inw inw
-#define inl inl
#define inb_p inb_p
#define inw_p inw_p
#define inl_p inl_p
@@ -331,9 +314,6 @@ BUILDIO(l, , int)
#define insw insw
#define insl insl
-#define outb outb
-#define outw outw
-#define outl outl
#define outb_p outb_p
#define outw_p outw_p
#define outl_p outl_p
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 111104d1c2cd..7793e52d6237 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -137,14 +137,6 @@ static __always_inline void arch_local_irq_restore(unsigned long flags)
if (!arch_irqs_disabled_flags(flags))
arch_local_irq_enable();
}
-#else
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_XEN_PV
-#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
-#else
-#define SWAPGS swapgs
-#endif
-#endif
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 0449b125d27f..071572e23d3a 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -20,7 +20,7 @@
_ASM_PTR "%c0 + %c1 - .\n\t" \
".popsection \n\t"
-#ifdef CONFIG_STACK_VALIDATION
+#ifdef CONFIG_HAVE_JUMP_LABEL_HACK
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
@@ -34,7 +34,7 @@ l_yes:
return true;
}
-#else
+#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */
static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
@@ -48,7 +48,7 @@ l_yes:
return true;
}
-#endif /* STACK_VALIDATION */
+#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */
static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 3c368b639c04..1a6d7e3f6c32 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -118,6 +118,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
+KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
KVM_X86_OP(get_msr_feature)
KVM_X86_OP(can_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d23e80a56eb8..4ff36610af6a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -974,12 +974,10 @@ enum hv_tsc_page_status {
HV_TSC_PAGE_UNSET = 0,
/* TSC page MSR was written by the guest, update pending */
HV_TSC_PAGE_GUEST_CHANGED,
- /* TSC page MSR was written by KVM userspace, update pending */
+ /* TSC page update was triggered from the host side */
HV_TSC_PAGE_HOST_CHANGED,
/* TSC page was properly set up and is currently active */
HV_TSC_PAGE_SET,
- /* TSC page is currently being updated and therefore is inactive */
- HV_TSC_PAGE_UPDATING,
/* TSC page was set up with an inaccessible GPA */
HV_TSC_PAGE_BROKEN,
};
@@ -1052,6 +1050,7 @@ enum kvm_apicv_inhibit {
APICV_INHIBIT_REASON_X2APIC,
APICV_INHIBIT_REASON_BLOCKIRQ,
APICV_INHIBIT_REASON_ABSENT,
+ APICV_INHIBIT_REASON_SEV,
};
struct kvm_arch {
@@ -1485,6 +1484,7 @@ struct kvm_x86_ops {
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
+ void (*guest_memory_reclaimed)(struct kvm *kvm);
int (*get_msr_feature)(struct kvm_msr_entry *entry);
@@ -1585,8 +1585,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
#define kvm_arch_pmi_in_guest(vcpu) \
((vcpu) && (vcpu)->arch.handling_intr_from_guest)
-int kvm_mmu_module_init(void);
-void kvm_mmu_module_exit(void);
+void kvm_mmu_x86_module_init(void);
+int kvm_mmu_vendor_module_init(void);
+void kvm_mmu_vendor_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 56935ebb1dfe..57bc74e112f2 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -7,6 +7,8 @@
#include <linux/interrupt.h>
#include <uapi/asm/kvm_para.h>
+#include <asm/tdx.h>
+
#ifdef CONFIG_KVM_GUEST
bool kvm_check_and_clear_guest_paused(void);
#else
@@ -32,6 +34,10 @@ static inline bool kvm_check_and_clear_guest_paused(void)
static inline long kvm_hypercall0(unsigned int nr)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, 0, 0, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr)
@@ -42,6 +48,10 @@ static inline long kvm_hypercall0(unsigned int nr)
static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, 0, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1)
@@ -53,6 +63,10 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
unsigned long p2)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, 0, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2)
@@ -64,6 +78,10 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, p3, 0);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3)
@@ -76,6 +94,10 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
unsigned long p4)
{
long ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return tdx_kvm_hypercall(nr, p1, p2, p3, p4);
+
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index e2c6f433ed10..88ceaf3648b3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -49,9 +49,6 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages,
void __init mem_encrypt_free_decrypted_mem(void);
-/* Architecture __weak replacement functions */
-void __init mem_encrypt_init(void);
-
void __init sev_es_init_vc_handling(void);
#define __bss_decrypted __section(".bss..decrypted")
@@ -89,6 +86,9 @@ static inline void mem_encrypt_free_decrypted_mem(void) { }
#endif /* CONFIG_AMD_MEM_ENCRYPT */
+/* Architecture __weak replacement functions */
+void __init mem_encrypt_init(void);
+
/*
* The __sme_pa() and __sme_pa_nodebug() macros are meant for use when
* writing to or comparing values from the cr3 register. Having the
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index d6bfdfb0f0af..0c3d3440fe27 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -131,10 +131,12 @@ extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
extern bool initrd_gone;
+void microcode_bsp_resume(void);
#else
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
+static inline void microcode_bsp_resume(void) { }
#endif
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 27516046117a..b8d40ddeab00 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -141,7 +141,7 @@ do { \
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
- lazy_load_gs(0); \
+ loadsegment(gs, 0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/arch/x86/include/asm/mmx.h
+++ /dev/null
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index b85147d75626..d71c7e8b738d 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -12,14 +12,17 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
/* Structs and defines for the X86 specific MSI message format */
typedef struct x86_msi_data {
- u32 vector : 8,
- delivery_mode : 3,
- dest_mode_logical : 1,
- reserved : 2,
- active_low : 1,
- is_level : 1;
-
- u32 dmar_subhandle;
+ union {
+ struct {
+ u32 vector : 8,
+ delivery_mode : 3,
+ dest_mode_logical : 1,
+ reserved : 2,
+ active_low : 1,
+ is_level : 1;
+ };
+ u32 dmar_subhandle;
+ };
} __attribute__ ((packed)) arch_msi_msg_data_t;
#define arch_msi_msg_data x86_msi_data
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 0eb90d21049e..14154b8b1720 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -76,6 +76,8 @@
/* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */
#define MSR_IA32_CORE_CAPS 0x000000cf
+#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT 2
+#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS BIT(MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT)
#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5
#define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT)
@@ -128,9 +130,9 @@
#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */
-/* SRBDS support */
#define MSR_IA32_MCU_OPT_CTRL 0x00000123
-#define RNGDS_MITG_DIS BIT(0)
+#define RNGDS_MITG_DIS BIT(0) /* SRBDS support */
+#define RTM_ALLOW BIT(1) /* TSX development mode */
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
@@ -154,6 +156,11 @@
#define MSR_IA32_POWER_CTL 0x000001fc
#define MSR_IA32_POWER_CTL_BIT_EE 19
+/* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */
+#define MSR_INTEGRITY_CAPS 0x000002d9
+#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4
+#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
+
#define MSR_LBR_NHM_FROM 0x00000680
#define MSR_LBR_NHM_TO 0x000006c0
#define MSR_LBR_CORE_FROM 0x00000040
@@ -502,8 +509,10 @@
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0
#define MSR_AMD64_SEV_ES_ENABLED_BIT 1
+#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
+#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
@@ -524,6 +533,11 @@
#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16)
#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24)
+/* AMD Performance Counter Global Status and Control MSRs */
+#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300
+#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
+#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9
@@ -688,6 +702,10 @@
#define MSR_IA32_PERF_CTL 0x00000199
#define INTEL_PERF_CTL_MASK 0xffff
+/* AMD Branch Sampling configuration */
+#define MSR_AMD_DBG_EXTN_CFG 0xc000010f
+#define MSR_AMD_SAMP_BR_FROM 0xc0010300
+
#define MSR_IA32_MPERF 0x000000e7
#define MSR_IA32_APERF 0x000000e8
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index d42e6c6b47b1..65ec1965cd28 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -10,16 +10,7 @@
#include <asm/errno.h>
#include <asm/cpumask.h>
#include <uapi/asm/msr.h>
-
-struct msr {
- union {
- struct {
- u32 l;
- u32 h;
- };
- u64 q;
- };
-};
+#include <asm/shared/msr.h>
struct msr_info {
u32 msr_no;
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 1cb9c17a4cb4..5c5f1e56c404 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -47,6 +47,7 @@ struct nmiaction {
#define register_nmi_handler(t, fn, fg, n, init...) \
({ \
static struct nmiaction init fn##_na = { \
+ .list = LIST_HEAD_INIT(fn##_na.list), \
.handler = (fn), \
.name = (n), \
.flags = (fg), \
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index e9c86299b835..baa70451b8df 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -16,7 +16,7 @@ extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
-static inline unsigned long __phys_addr_nodebug(unsigned long x)
+static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index a0627dfae541..1307cd689d2a 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -93,6 +93,15 @@ struct irq_routing_table {
struct irq_info slots[];
} __attribute__((packed));
+struct irt_routing_table {
+ u32 signature; /* IRT_SIGNATURE should be here */
+ u8 size; /* Number of entries provided */
+ u8 used; /* Number of entries actually used */
+ u16 exclusive_irqs; /* IRQs devoted exclusively to
+ PCI usage */
+ struct irq_info slots[];
+} __attribute__((packed));
+
extern unsigned int pcibios_irq_mask;
extern raw_spinlock_t pci_config_lock;
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a3c33b79fb86..13c0d63ed55e 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -38,9 +38,9 @@
#define arch_raw_cpu_ptr(ptr) \
({ \
unsigned long tcp_ptr__; \
- asm volatile("add " __percpu_arg(1) ", %0" \
- : "=r" (tcp_ptr__) \
- : "m" (this_cpu_off), "0" (ptr)); \
+ asm ("add " __percpu_arg(1) ", %0" \
+ : "=r" (tcp_ptr__) \
+ : "m" (this_cpu_off), "0" (ptr)); \
(typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
})
#else
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 58d9e4b1fa0a..409725e86f42 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_PERF_EVENT_H
#define _ASM_X86_PERF_EVENT_H
+#include <linux/static_call.h>
+
/*
* Performance event hw details:
*/
@@ -184,6 +186,18 @@ union cpuid28_ecx {
unsigned int full;
};
+/*
+ * AMD "Extended Performance Monitoring and Debug" CPUID
+ * detection/enumeration details:
+ */
+union cpuid_0x80000022_ebx {
+ struct {
+ /* Number of Core Performance Counters */
+ unsigned int num_core_pmc:4;
+ } split;
+ unsigned int full;
+};
+
struct x86_pmu_capability {
int version;
int num_counters_gp;
@@ -241,6 +255,11 @@ struct x86_pmu_capability {
#define INTEL_PMC_IDX_FIXED_SLOTS (INTEL_PMC_IDX_FIXED + 3)
#define INTEL_PMC_MSK_FIXED_SLOTS (1ULL << INTEL_PMC_IDX_FIXED_SLOTS)
+static inline bool use_fixed_pseudo_encoding(u64 code)
+{
+ return !(code & 0xff);
+}
+
/*
* We model BTS tracing as another fixed-mode PMC.
*
@@ -366,6 +385,11 @@ struct pebs_xmm {
};
/*
+ * AMD Extended Performance Monitoring and Debug cpuid feature detection
+ */
+#define EXT_PERFMON_DEBUG_FEATURES 0x80000022
+
+/*
* IBS cpuid feature detection
*/
@@ -386,6 +410,7 @@ struct pebs_xmm {
#define IBS_CAPS_OPBRNFUSE (1U<<8)
#define IBS_CAPS_FETCHCTLEXTD (1U<<9)
#define IBS_CAPS_OPDATA4 (1U<<10)
+#define IBS_CAPS_ZEN4 (1U<<11)
#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
| IBS_CAPS_FETCHSAM \
@@ -399,6 +424,7 @@ struct pebs_xmm {
#define IBSCTL_LVT_OFFSET_MASK 0x0F
/* IBS fetch bits/masks */
+#define IBS_FETCH_L3MISSONLY (1ULL<<59)
#define IBS_FETCH_RAND_EN (1ULL<<57)
#define IBS_FETCH_VAL (1ULL<<49)
#define IBS_FETCH_ENABLE (1ULL<<48)
@@ -415,6 +441,7 @@ struct pebs_xmm {
#define IBS_OP_CNT_CTL (1ULL<<19)
#define IBS_OP_VAL (1ULL<<18)
#define IBS_OP_ENABLE (1ULL<<17)
+#define IBS_OP_L3MISSONLY (1ULL<<16)
#define IBS_OP_MAX_CNT 0x0000FFFFULL
#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
#define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */
@@ -513,6 +540,27 @@ static inline void intel_pt_handle_vmx(int on)
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
extern void amd_pmu_enable_virt(void);
extern void amd_pmu_disable_virt(void);
+
+#if defined(CONFIG_PERF_EVENTS_AMD_BRS)
+
+#define PERF_NEEDS_LOPWR_CB 1
+
+/*
+ * architectural low power callback impacts
+ * drivers/acpi/processor_idle.c
+ * drivers/acpi/acpi_pad.c
+ */
+extern void perf_amd_brs_lopwr_cb(bool lopwr_in);
+
+DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
+
+static inline void perf_lopwr_cb(bool lopwr_in)
+{
+ static_call_mod(perf_lopwr_cb)(lopwr_in);
+}
+
+#endif /* PERF_NEEDS_LOPWR_CB */
+
#else
static inline void amd_pmu_enable_virt(void) { }
static inline void amd_pmu_disable_virt(void) { }
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 40497a9020c6..407084d9fd99 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -559,10 +559,6 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
-
-struct mm_struct;
-extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
- unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 1d5f14aff5f6..2e6c04d8a45b 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -41,9 +41,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
return __arch_override_mprotect_pkey(vma, prot, pkey);
}
-extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val);
-
#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
@@ -118,11 +115,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey)
return 0;
}
-extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val);
-extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val);
-
static inline int vma_pkey(struct vm_area_struct *vma)
{
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index feed36d44d04..12ef86b19910 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -13,6 +13,8 @@ void syscall_init(void);
#ifdef CONFIG_X86_64
void entry_SYSCALL_64(void);
void entry_SYSCALL_64_safe_stack(void);
+void entry_SYSRETQ_unsafe_stack(void);
+void entry_SYSRETQ_end(void);
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif
@@ -28,6 +30,8 @@ void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
void entry_SYSCALL_compat_safe_stack(void);
+void entry_SYSRETL_compat_unsafe_stack(void);
+void entry_SYSRETL_compat_end(void);
void entry_INT80_compat(void);
#ifdef CONFIG_XEN_PV
void xen_entry_INT80_compat(void);
@@ -35,11 +39,9 @@ void xen_entry_INT80_compat(void);
#endif
void x86_configure_nx(void);
-void x86_report_nx(void);
extern int reboot_force;
-long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long arg2);
+long do_arch_prctl_common(int option, unsigned long arg2);
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 4357e0f2cd5f..f4db78b09c8f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -186,9 +186,13 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs)
bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack);
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack &&
+ regs->ip < (unsigned long)entry_SYSRETQ_end);
#ifdef CONFIG_IA32_EMULATION
ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack);
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack &&
+ regs->ip < (unsigned long)entry_SYSRETL_compat_end);
#endif
return ret;
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 331474b150f1..fd6f6e5b755a 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -25,6 +25,7 @@ struct real_mode_header {
u32 sev_es_trampoline_start;
#endif
#ifdef CONFIG_X86_64
+ u32 trampoline_start64;
u32 trampoline_pgd;
#endif
/* ACPI S3 wakeup */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 656ed6531d03..2e7890dd58a4 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -350,18 +350,6 @@ static inline void __loadsegment_fs(unsigned short value)
#define savesegment(seg, value) \
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
-/*
- * x86-32 user GS accessors. This is ugly and could do with some cleaning up.
- */
-#ifdef CONFIG_X86_32
-# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
-# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
-# define task_user_gs(tsk) ((tsk)->thread.gs)
-# define lazy_save_gs(v) savesegment(gs, (v))
-# define lazy_load_gs(v) loadsegment(gs, (v))
-# define load_gs_index(v) loadsegment(gs, (v))
-#endif /* X86_32 */
-
#endif /* !__ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 896e48d45828..7590ac2570b9 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -50,7 +50,6 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
-extern unsigned long __startup_secondary_64(void);
extern void startup_64_setup_env(unsigned long physbase);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
@@ -109,27 +108,19 @@ extern unsigned long _brk_end;
void *extend_brk(size_t size, size_t align);
/*
- * Reserve space in the brk section. The name must be unique within
- * the file, and somewhat descriptive. The size is in bytes. Must be
- * used at file scope.
+ * Reserve space in the brk section. The name must be unique within the file,
+ * and somewhat descriptive. The size is in bytes.
*
- * (This uses a temp function to wrap the asm so we can pass it the
- * size parameter; otherwise we wouldn't be able to. We can't use a
- * "section" attribute on a normal variable because it always ends up
- * being @progbits, which ends up allocating space in the vmlinux
- * executable.)
+ * The allocation is done using inline asm (rather than using a section
+ * attribute on a normal variable) in order to allow the use of @nobits, so
+ * that it doesn't take up any space in the vmlinux file.
*/
-#define RESERVE_BRK(name,sz) \
- static void __section(".discard.text") __noendbr __used notrace \
- __brk_reservation_fn_##name##__(void) { \
- asm volatile ( \
- ".pushsection .brk_reservation,\"aw\",@nobits;" \
- ".brk." #name ":" \
- " 1:.skip %c0;" \
- " .size .brk." #name ", . - 1b;" \
- " .popsection" \
- : : "i" (sz)); \
- }
+#define RESERVE_BRK(name, size) \
+ asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \
+ ".brk." #name ":\n\t" \
+ ".skip " __stringify(size) "\n\t" \
+ ".size .brk." #name ", " __stringify(size) "\n\t" \
+ ".popsection\n\t")
extern void probe_roms(void);
#ifdef __i386__
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 1b2fd32b42fe..b8357d6ecd47 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -57,9 +57,79 @@
#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+/* GHCB GPA Register */
+#define GHCB_MSR_REG_GPA_REQ 0x012
+#define GHCB_MSR_REG_GPA_REQ_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)((v) & GENMASK_ULL(51, 0)) << 12) | \
+ /* GHCBData[11:0] */ \
+ GHCB_MSR_REG_GPA_REQ)
+
+#define GHCB_MSR_REG_GPA_RESP 0x013
+#define GHCB_MSR_REG_GPA_RESP_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
+
+/*
+ * SNP Page State Change Operation
+ *
+ * GHCBData[55:52] - Page operation:
+ * 0x0001 Page assignment, Private
+ * 0x0002 Page assignment, Shared
+ */
+enum psc_op {
+ SNP_PAGE_STATE_PRIVATE = 1,
+ SNP_PAGE_STATE_SHARED,
+};
+
+#define GHCB_MSR_PSC_REQ 0x014
+#define GHCB_MSR_PSC_REQ_GFN(gfn, op) \
+ /* GHCBData[55:52] */ \
+ (((u64)((op) & 0xf) << 52) | \
+ /* GHCBData[51:12] */ \
+ ((u64)((gfn) & GENMASK_ULL(39, 0)) << 12) | \
+ /* GHCBData[11:0] */ \
+ GHCB_MSR_PSC_REQ)
+
+#define GHCB_MSR_PSC_RESP 0x015
+#define GHCB_MSR_PSC_RESP_VAL(val) \
+ /* GHCBData[63:32] */ \
+ (((u64)(val) & GENMASK_ULL(63, 32)) >> 32)
+
/* GHCB Hypervisor Feature Request/Response */
#define GHCB_MSR_HV_FT_REQ 0x080
#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_RESP_VAL(v) \
+ /* GHCBData[63:12] */ \
+ (((u64)(v) & GENMASK_ULL(63, 12)) >> 12)
+
+#define GHCB_HV_FT_SNP BIT_ULL(0)
+#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1)
+
+/* SNP Page State Change NAE event */
+#define VMGEXIT_PSC_MAX_ENTRY 253
+
+struct psc_hdr {
+ u16 cur_entry;
+ u16 end_entry;
+ u32 reserved;
+} __packed;
+
+struct psc_entry {
+ u64 cur_page : 12,
+ gfn : 40,
+ operation : 4,
+ pagesize : 1,
+ reserved : 7;
+} __packed;
+
+struct snp_psc_desc {
+ struct psc_hdr hdr;
+ struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY];
+} __packed;
+
+/* Guest message request error code */
+#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32)
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
@@ -73,8 +143,20 @@
/* GHCBData[23:16] */ \
((((u64)reason_val) & 0xff) << 16))
+/* Error codes from reason set 0 */
+#define SEV_TERM_SET_GEN 0
#define GHCB_SEV_ES_GEN_REQ 0
#define GHCB_SEV_ES_PROT_UNSUPPORTED 1
+#define GHCB_SNP_UNSUPPORTED 2
+
+/* Linux-specific reason codes (used with reason set 1) */
+#define SEV_TERM_SET_LINUX 1
+#define GHCB_TERM_REGISTER 0 /* GHCB GPA registration failure */
+#define GHCB_TERM_PSC 1 /* Page State Change failure */
+#define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */
+#define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */
+#define GHCB_TERM_CPUID 4 /* CPUID-validation failure */
+#define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ec060c433589..19514524f0f8 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -11,9 +11,10 @@
#include <linux/types.h>
#include <asm/insn.h>
#include <asm/sev-common.h>
+#include <asm/bootparam.h>
-#define GHCB_PROTO_OUR 0x0001UL
-#define GHCB_PROTOCOL_MAX 1ULL
+#define GHCB_PROTOCOL_MIN 1ULL
+#define GHCB_PROTOCOL_MAX 2ULL
#define GHCB_DEFAULT_USAGE 0ULL
#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
@@ -42,6 +43,24 @@ struct es_em_ctxt {
struct es_fault_info fi;
};
+/*
+ * AMD SEV Confidential computing blob structure. The structure is
+ * defined in OVMF UEFI firmware header:
+ * https://github.com/tianocore/edk2/blob/master/OvmfPkg/Include/Guid/ConfidentialComputingSevSnpBlob.h
+ */
+#define CC_BLOB_SEV_HDR_MAGIC 0x45444d41
+struct cc_blob_sev_info {
+ u32 magic;
+ u16 version;
+ u16 reserved;
+ u64 secrets_phys;
+ u32 secrets_len;
+ u32 rsvd1;
+ u64 cpuid_phys;
+ u32 cpuid_len;
+ u32 rsvd2;
+} __packed;
+
void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code);
static inline u64 lower_bits(u64 val, unsigned int bits)
@@ -60,6 +79,61 @@ extern void vc_no_ghcb(void);
extern void vc_boot_ghcb(void);
extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
+/* Software defined (when rFlags.CF = 1) */
+#define PVALIDATE_FAIL_NOUPDATE 255
+
+/* RMP page size */
+#define RMP_PG_SIZE_4K 0
+
+#define RMPADJUST_VMSA_PAGE_BIT BIT(16)
+
+/* SNP Guest message request */
+struct snp_req_data {
+ unsigned long req_gpa;
+ unsigned long resp_gpa;
+ unsigned long data_gpa;
+ unsigned int data_npages;
+};
+
+struct sev_guest_platform_data {
+ u64 secrets_gpa;
+};
+
+/*
+ * The secrets page contains 96-bytes of reserved field that can be used by
+ * the guest OS. The guest OS uses the area to save the message sequence
+ * number for each VMPCK.
+ *
+ * See the GHCB spec section Secret page layout for the format for this area.
+ */
+struct secrets_os_area {
+ u32 msg_seqno_0;
+ u32 msg_seqno_1;
+ u32 msg_seqno_2;
+ u32 msg_seqno_3;
+ u64 ap_jump_table_pa;
+ u8 rsvd[40];
+ u8 guest_usage[32];
+} __packed;
+
+#define VMPCK_KEY_LEN 32
+
+/* See the SNP spec version 0.9 for secrets page format */
+struct snp_secrets_page_layout {
+ u32 version;
+ u32 imien : 1,
+ rsvd1 : 31;
+ u32 fms;
+ u32 rsvd2;
+ u8 gosvw[16];
+ u8 vmpck0[VMPCK_KEY_LEN];
+ u8 vmpck1[VMPCK_KEY_LEN];
+ u8 vmpck2[VMPCK_KEY_LEN];
+ u8 vmpck3[VMPCK_KEY_LEN];
+ struct secrets_os_area os_area;
+ u8 rsvd3[3840];
+} __packed;
+
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern struct static_key_false sev_es_enable_key;
extern void __sev_es_ist_enter(struct pt_regs *regs);
@@ -87,12 +161,71 @@ extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
struct es_em_ctxt *ctxt,
u64 exit_code, u64 exit_info_1,
u64 exit_info_2);
+static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
+{
+ int rc;
+
+ /* "rmpadjust" mnemonic support in binutils 2.36 and newer */
+ asm volatile(".byte 0xF3,0x0F,0x01,0xFE\n\t"
+ : "=a"(rc)
+ : "a"(vaddr), "c"(rmp_psize), "d"(attrs)
+ : "memory", "cc");
+
+ return rc;
+}
+static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
+{
+ bool no_rmpupdate;
+ int rc;
+
+ /* "pvalidate" mnemonic support in binutils 2.36 and newer */
+ asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t"
+ CC_SET(c)
+ : CC_OUT(c) (no_rmpupdate), "=a"(rc)
+ : "a"(vaddr), "c"(rmp_psize), "d"(validate)
+ : "memory", "cc");
+
+ if (no_rmpupdate)
+ return PVALIDATE_FAIL_NOUPDATE;
+
+ return rc;
+}
+void setup_ghcb(void);
+void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned int npages);
+void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned int npages);
+void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
+void snp_set_memory_shared(unsigned long vaddr, unsigned int npages);
+void snp_set_memory_private(unsigned long vaddr, unsigned int npages);
+void snp_set_wakeup_secondary_cpu(void);
+bool snp_init(struct boot_params *bp);
+void snp_abort(void);
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
static inline void sev_es_nmi_complete(void) { }
static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
+static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
+static inline void setup_ghcb(void) { }
+static inline void __init
+early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { }
+static inline void __init
+early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { }
+static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { }
+static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { }
+static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { }
+static inline void snp_set_wakeup_secondary_cpu(void) { }
+static inline bool snp_init(struct boot_params *bp) { return false; }
+static inline void snp_abort(void) { }
+static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input,
+ unsigned long *fw_err)
+{
+ return -ENOTTY;
+}
#endif
#endif
diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h
new file mode 100644
index 000000000000..c0ef921c0586
--- /dev/null
+++ b/arch/x86/include/asm/shared/io.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_IO_H
+#define _ASM_X86_SHARED_IO_H
+
+#include <linux/types.h>
+
+#define BUILDIO(bwl, bw, type) \
+static inline void __out##bwl(type value, u16 port) \
+{ \
+ asm volatile("out" #bwl " %" #bw "0, %w1" \
+ : : "a"(value), "Nd"(port)); \
+} \
+ \
+static inline type __in##bwl(u16 port) \
+{ \
+ type value; \
+ asm volatile("in" #bwl " %w1, %" #bw "0" \
+ : "=a"(value) : "Nd"(port)); \
+ return value; \
+}
+
+BUILDIO(b, b, u8)
+BUILDIO(w, w, u16)
+BUILDIO(l, , u32)
+#undef BUILDIO
+
+#define inb __inb
+#define inw __inw
+#define inl __inl
+#define outb __outb
+#define outw __outw
+#define outl __outl
+
+#endif
diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h
new file mode 100644
index 000000000000..1e6ec10b3a15
--- /dev/null
+++ b/arch/x86/include/asm/shared/msr.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_MSR_H
+#define _ASM_X86_SHARED_MSR_H
+
+struct msr {
+ union {
+ struct {
+ u32 l;
+ u32 h;
+ };
+ u64 q;
+ };
+};
+
+#endif /* _ASM_X86_SHARED_MSR_H */
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
new file mode 100644
index 000000000000..e53f26228fbb
--- /dev/null
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHARED_TDX_H
+#define _ASM_X86_SHARED_TDX_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+#define TDX_HYPERCALL_STANDARD 0
+
+#define TDX_HCALL_HAS_OUTPUT BIT(0)
+#define TDX_HCALL_ISSUE_STI BIT(1)
+
+#define TDX_CPUID_LEAF_ID 0x21
+#define TDX_IDENT "IntelTDX "
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Used in __tdx_hypercall() to pass down and get back registers' values of
+ * the TDCALL instruction when requesting services from the VMM.
+ *
+ * This is a software only structure and not part of the TDX module/VMM ABI.
+ */
+struct tdx_hypercall_args {
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+};
+
+/* Used to request services from the VMM */
+u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags);
+
+/* Called from __tdx_hypercall() for unrecoverable failure */
+void __tdx_hypercall_failed(void);
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_SHARED_TDX_H */
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index d17b39893b79..bab490379c65 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -19,25 +19,14 @@
#ifdef __ASSEMBLY__
-#ifdef CONFIG_X86_SMAP
-
#define ASM_CLAC \
ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP
#define ASM_STAC \
ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP
-#else /* CONFIG_X86_SMAP */
-
-#define ASM_CLAC
-#define ASM_STAC
-
-#endif /* CONFIG_X86_SMAP */
-
#else /* __ASSEMBLY__ */
-#ifdef CONFIG_X86_SMAP
-
static __always_inline void clac(void)
{
/* Note: a barrier is implicit in alternative() */
@@ -76,19 +65,6 @@ static __always_inline void smap_restore(unsigned long flags)
#define ASM_STAC \
ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP)
-#else /* CONFIG_X86_SMAP */
-
-static inline void clac(void) { }
-static inline void stac(void) { }
-
-static inline unsigned long smap_save(void) { return 0; }
-static inline void smap_restore(unsigned long flags) { }
-
-#define ASM_CLAC
-#define ASM_STAC
-
-#endif /* CONFIG_X86_SMAP */
-
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMAP_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 68c257a3de0d..45b18eb94fa1 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -184,14 +184,15 @@ static inline void wbinvd(void)
native_wbinvd();
}
-#ifdef CONFIG_X86_64
static inline void load_gs_index(unsigned int selector)
{
+#ifdef CONFIG_X86_64
native_load_gs_index(selector);
-}
-
+#else
+ loadsegment(gs, selector);
#endif
+}
#endif /* CONFIG_PARAVIRT_XXL */
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index ed4f8bb6c2d9..2d8dacd02643 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -26,6 +26,7 @@
".align 4 \n" \
".globl " STATIC_CALL_TRAMP_STR(name) " \n" \
STATIC_CALL_TRAMP_STR(name) ": \n" \
+ ANNOTATE_NOENDBR \
insns " \n" \
".byte 0x53, 0x43, 0x54 \n" \
".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \
@@ -38,6 +39,8 @@
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")
+#define ARCH_DEFINE_STATIC_CALL_RET0_TRAMP(name) \
+ ARCH_DEFINE_STATIC_CALL_TRAMP(name, __static_call_return0)
#define ARCH_ADD_TRAMP_KEY(name) \
asm(".pushsection .static_call_tramp_key, \"a\" \n" \
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 7b132d0312eb..a800abb1a992 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -19,7 +19,6 @@ struct saved_context {
u16 gs;
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
- bool misc_enable_saved;
struct saved_msrs saved_msrs;
struct desc_ptr gdt_desc;
struct desc_ptr idt;
@@ -28,6 +27,7 @@ struct saved_context {
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ bool misc_enable_saved;
} __attribute__((packed));
/* routines for saving/restoring kernel state */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 35bb35d28733..54df06687d83 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -14,9 +14,13 @@
* Image of the saved processor state, used by the low level ACPI suspend to
* RAM code and by the low level hibernation code.
*
- * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
- * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
- * still work as required.
+ * If you modify it, check how it is used in arch/x86/kernel/acpi/wakeup_64.S
+ * and make sure that __save/__restore_processor_state(), defined in
+ * arch/x86/power/cpu.c, still work as required.
+ *
+ * Because the structure is packed, make sure to avoid unaligned members. For
+ * optimisation purposes but also because tools like kmemleak only search for
+ * pointers that are aligned.
*/
struct saved_context {
struct pt_regs regs;
@@ -36,7 +40,6 @@ struct saved_context {
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
- bool misc_enable_saved;
struct saved_msrs saved_msrs;
unsigned long efer;
u16 gdt_pad; /* Unused */
@@ -48,6 +51,7 @@ struct saved_context {
unsigned long tr;
unsigned long safety;
unsigned long return_address;
+ bool misc_enable_saved;
} __attribute__((packed));
#define loaddebug(thread,register) \
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index f70a5108d464..1b07fba11704 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -271,6 +271,7 @@ struct vmcb_seg {
u64 base;
} __packed;
+/* Save area definition for legacy and SEV-MEM guests */
struct vmcb_save_area {
struct vmcb_seg es;
struct vmcb_seg cs;
@@ -282,12 +283,12 @@ struct vmcb_save_area {
struct vmcb_seg ldtr;
struct vmcb_seg idtr;
struct vmcb_seg tr;
- u8 reserved_1[43];
+ u8 reserved_1[42];
+ u8 vmpl;
u8 cpl;
u8 reserved_2[4];
u64 efer;
- u8 reserved_3[104];
- u64 xss; /* Valid for SEV-ES only */
+ u8 reserved_3[112];
u64 cr4;
u64 cr3;
u64 cr0;
@@ -297,7 +298,9 @@ struct vmcb_save_area {
u64 rip;
u8 reserved_4[88];
u64 rsp;
- u8 reserved_5[24];
+ u64 s_cet;
+ u64 ssp;
+ u64 isst_addr;
u64 rax;
u64 star;
u64 lstar;
@@ -308,29 +311,145 @@ struct vmcb_save_area {
u64 sysenter_esp;
u64 sysenter_eip;
u64 cr2;
- u8 reserved_6[32];
+ u8 reserved_5[32];
u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
-
- /*
- * The following part of the save area is valid only for
- * SEV-ES guests when referenced through the GHCB or for
- * saving to the host save area.
- */
- u8 reserved_7[72];
+ u8 reserved_6[72];
u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
- u8 reserved_7b[4];
+} __packed;
+
+/* Save area definition for SEV-ES and SEV-SNP guests */
+struct sev_es_save_area {
+ struct vmcb_seg es;
+ struct vmcb_seg cs;
+ struct vmcb_seg ss;
+ struct vmcb_seg ds;
+ struct vmcb_seg fs;
+ struct vmcb_seg gs;
+ struct vmcb_seg gdtr;
+ struct vmcb_seg ldtr;
+ struct vmcb_seg idtr;
+ struct vmcb_seg tr;
+ u64 vmpl0_ssp;
+ u64 vmpl1_ssp;
+ u64 vmpl2_ssp;
+ u64 vmpl3_ssp;
+ u64 u_cet;
+ u8 reserved_1[2];
+ u8 vmpl;
+ u8 cpl;
+ u8 reserved_2[4];
+ u64 efer;
+ u8 reserved_3[104];
+ u64 xss;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u64 dr0;
+ u64 dr1;
+ u64 dr2;
+ u64 dr3;
+ u64 dr0_addr_mask;
+ u64 dr1_addr_mask;
+ u64 dr2_addr_mask;
+ u64 dr3_addr_mask;
+ u8 reserved_4[24];
+ u64 rsp;
+ u64 s_cet;
+ u64 ssp;
+ u64 isst_addr;
+ u64 rax;
+ u64 star;
+ u64 lstar;
+ u64 cstar;
+ u64 sfmask;
+ u64 kernel_gs_base;
+ u64 sysenter_cs;
+ u64 sysenter_esp;
+ u64 sysenter_eip;
+ u64 cr2;
+ u8 reserved_5[32];
+ u64 g_pat;
+ u64 dbgctl;
+ u64 br_from;
+ u64 br_to;
+ u64 last_excp_from;
+ u64 last_excp_to;
+ u8 reserved_7[80];
u32 pkru;
- u8 reserved_7a[20];
- u64 reserved_8; /* rax already available at 0x01f8 */
+ u8 reserved_8[20];
+ u64 reserved_9; /* rax already available at 0x01f8 */
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 reserved_10; /* rsp already available at 0x01d8 */
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u8 reserved_11[16];
+ u64 guest_exit_info_1;
+ u64 guest_exit_info_2;
+ u64 guest_exit_int_info;
+ u64 guest_nrip;
+ u64 sev_features;
+ u64 vintr_ctrl;
+ u64 guest_exit_code;
+ u64 virtual_tom;
+ u64 tlb_id;
+ u64 pcpu_id;
+ u64 event_inj;
+ u64 xcr0;
+ u8 reserved_12[16];
+
+ /* Floating point area */
+ u64 x87_dp;
+ u32 mxcsr;
+ u16 x87_ftw;
+ u16 x87_fsw;
+ u16 x87_fcw;
+ u16 x87_fop;
+ u16 x87_ds;
+ u16 x87_cs;
+ u64 x87_rip;
+ u8 fpreg_x87[80];
+ u8 fpreg_xmm[256];
+ u8 fpreg_ymm[256];
+} __packed;
+
+struct ghcb_save_area {
+ u8 reserved_1[203];
+ u8 cpl;
+ u8 reserved_2[116];
+ u64 xss;
+ u8 reserved_3[24];
+ u64 dr7;
+ u8 reserved_4[16];
+ u64 rip;
+ u8 reserved_5[88];
+ u64 rsp;
+ u8 reserved_6[24];
+ u64 rax;
+ u8 reserved_7[264];
u64 rcx;
u64 rdx;
u64 rbx;
- u64 reserved_9; /* rsp already available at 0x01d8 */
+ u8 reserved_8[8];
u64 rbp;
u64 rsi;
u64 rdi;
@@ -342,22 +461,24 @@ struct vmcb_save_area {
u64 r13;
u64 r14;
u64 r15;
- u8 reserved_10[16];
+ u8 reserved_9[16];
u64 sw_exit_code;
u64 sw_exit_info_1;
u64 sw_exit_info_2;
u64 sw_scratch;
- u8 reserved_11[56];
+ u8 reserved_10[56];
u64 xcr0;
u8 valid_bitmap[16];
u64 x87_state_gpa;
} __packed;
+#define GHCB_SHARED_BUF_SIZE 2032
+
struct ghcb {
- struct vmcb_save_area save;
- u8 reserved_save[2048 - sizeof(struct vmcb_save_area)];
+ struct ghcb_save_area save;
+ u8 reserved_save[2048 - sizeof(struct ghcb_save_area)];
- u8 shared_buffer[2032];
+ u8 shared_buffer[GHCB_SHARED_BUF_SIZE];
u8 reserved_1[10];
u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
@@ -365,13 +486,17 @@ struct ghcb {
} __packed;
-#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
+#define EXPECTED_VMCB_SAVE_AREA_SIZE 740
+#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032
+#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648
#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
{
BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
}
@@ -441,26 +566,26 @@ struct vmcb {
/* GHCB Accessor functions */
#define GHCB_BITMAP_IDX(field) \
- (offsetof(struct vmcb_save_area, field) / sizeof(u64))
+ (offsetof(struct ghcb_save_area, field) / sizeof(u64))
#define DEFINE_GHCB_ACCESSORS(field) \
- static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
+ static __always_inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
{ \
return test_bit(GHCB_BITMAP_IDX(field), \
(unsigned long *)&ghcb->save.valid_bitmap); \
} \
\
- static inline u64 ghcb_get_##field(struct ghcb *ghcb) \
+ static __always_inline u64 ghcb_get_##field(struct ghcb *ghcb) \
{ \
return ghcb->save.field; \
} \
\
- static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \
+ static __always_inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \
{ \
return ghcb_##field##_is_valid(ghcb) ? ghcb->save.field : 0; \
} \
\
- static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
+ static __always_inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
{ \
__set_bit(GHCB_BITMAP_IDX(field), \
(unsigned long *)&ghcb->save.valid_bitmap); \
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
new file mode 100644
index 000000000000..020c81a7c729
--- /dev/null
+++ b/arch/x86/include/asm/tdx.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021-2022 Intel Corporation */
+#ifndef _ASM_X86_TDX_H
+#define _ASM_X86_TDX_H
+
+#include <linux/init.h>
+#include <linux/bits.h>
+#include <asm/ptrace.h>
+#include <asm/shared/tdx.h>
+
+/*
+ * SW-defined error codes.
+ *
+ * Bits 47:40 == 0xFF indicate Reserved status code class that never used by
+ * TDX module.
+ */
+#define TDX_ERROR _BITUL(63)
+#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40))
+#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000))
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Used to gather the output registers values of the TDCALL and SEAMCALL
+ * instructions when requesting services from the TDX module.
+ *
+ * This is a software only structure and not part of the TDX module/VMM ABI.
+ */
+struct tdx_module_output {
+ u64 rcx;
+ u64 rdx;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+};
+
+/*
+ * Used by the #VE exception handler to gather the #VE exception
+ * info from the TDX module. This is a software only structure
+ * and not part of the TDX module/VMM ABI.
+ */
+struct ve_info {
+ u64 exit_reason;
+ u64 exit_qual;
+ /* Guest Linear (virtual) Address */
+ u64 gla;
+ /* Guest Physical Address */
+ u64 gpa;
+ u32 instr_len;
+ u32 instr_info;
+};
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+void __init tdx_early_init(void);
+
+/* Used to communicate with the TDX module */
+u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
+ struct tdx_module_output *out);
+
+void tdx_get_ve_info(struct ve_info *ve);
+
+bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
+
+void tdx_safe_halt(void);
+
+bool tdx_early_handle_ve(struct pt_regs *regs);
+
+#else
+
+static inline void tdx_early_init(void) { };
+static inline void tdx_safe_halt(void) { };
+
+static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
+
+#endif /* CONFIG_INTEL_TDX_GUEST */
+
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_INTEL_TDX_GUEST)
+long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
+ unsigned long p3, unsigned long p4);
+#else
+static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
+ unsigned long p2, unsigned long p3,
+ unsigned long p4)
+{
+ return -ENODEV;
+}
+#endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_TDX_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ebec69c35e95..f0cb881c1d69 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,7 +92,6 @@ struct thread_info {
#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */
-#define TIF_SLD 18 /* Restore split lock detection on context switch */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -116,7 +115,6 @@ struct thread_info {
#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL)
-#define _TIF_SLD (1 << TIF_SLD)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
@@ -128,7 +126,7 @@ struct thread_info {
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW_BASE \
(_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \
- _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD)
+ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
/*
* Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
index a4a8b1b16c0c..956e4145311b 100644
--- a/arch/x86/include/asm/timex.h
+++ b/arch/x86/include/asm/timex.h
@@ -5,6 +5,15 @@
#include <asm/processor.h>
#include <asm/tsc.h>
+static inline unsigned long random_get_entropy(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_TSC) &&
+ !cpu_feature_enabled(X86_FEATURE_TSC))
+ return random_get_entropy_fallback();
+ return rdtsc();
+}
+#define random_get_entropy random_get_entropy
+
/* Assume we use the PIT time source for the clock tick */
#define CLOCK_TICK_RATE PIT_TICK_RATE
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 9619385bf749..458c891a8273 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -212,30 +212,19 @@ static inline long arch_scale_freq_capacity(int cpu)
}
#define arch_scale_freq_capacity arch_scale_freq_capacity
-extern void arch_scale_freq_tick(void);
-#define arch_scale_freq_tick arch_scale_freq_tick
-
extern void arch_set_max_freq_ratio(bool turbo_disabled);
-void init_freq_invariance(bool secondary, bool cppc_ready);
+extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
#else
-static inline void arch_set_max_freq_ratio(bool turbo_disabled)
-{
-}
-static inline void init_freq_invariance(bool secondary, bool cppc_ready)
-{
-}
+static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
+static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
#endif
+extern void arch_scale_freq_tick(void);
+#define arch_scale_freq_tick arch_scale_freq_tick
+
#ifdef CONFIG_ACPI_CPPC_LIB
void init_freq_invariance_cppc(void);
#define arch_init_invariance_cppc init_freq_invariance_cppc
-
-bool amd_set_max_freq_ratio(u64 *ratio);
-#else
-static inline bool amd_set_max_freq_ratio(u64 *ratio)
-{
- return false;
-}
#endif
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 35317c5c551d..47ecfff2c83d 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -13,7 +13,7 @@
#ifdef CONFIG_X86_64
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
-struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
+struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs);
void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 01a300a9700b..fbdc3d951494 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -20,13 +20,12 @@ extern void disable_TSC(void);
static inline cycles_t get_cycles(void)
{
-#ifndef CONFIG_X86_TSC
- if (!boot_cpu_has(X86_FEATURE_TSC))
+ if (!IS_ENABLED(CONFIG_X86_TSC) &&
+ !cpu_feature_enabled(X86_FEATURE_TSC))
return 0;
-#endif
-
return rdtsc();
}
+#define get_cycles get_cycles
extern struct system_counterval_t convert_art_to_tsc(u64 art);
extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns);
diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h
index 7ee7ba0d63a3..769b939444ae 100644
--- a/arch/x86/include/uapi/asm/amd_hsmp.h
+++ b/arch/x86/include/uapi/asm/amd_hsmp.h
@@ -31,9 +31,22 @@ enum hsmp_message_ids {
HSMP_GET_CCLK_THROTTLE_LIMIT, /* 10h Get CCLK frequency limit in socket */
HSMP_GET_C0_PERCENT, /* 11h Get average C0 residency in socket */
HSMP_SET_NBIO_DPM_LEVEL, /* 12h Set max/min LCLK DPM Level for a given NBIO */
- /* 13h Reserved */
- HSMP_GET_DDR_BANDWIDTH = 0x14, /* 14h Get theoretical maximum and current DDR Bandwidth */
- HSMP_GET_TEMP_MONITOR, /* 15h Get per-DIMM temperature and refresh rates */
+ HSMP_GET_NBIO_DPM_LEVEL, /* 13h Get LCLK DPM level min and max for a given NBIO */
+ HSMP_GET_DDR_BANDWIDTH, /* 14h Get theoretical maximum and current DDR Bandwidth */
+ HSMP_GET_TEMP_MONITOR, /* 15h Get socket temperature */
+ HSMP_GET_DIMM_TEMP_RANGE, /* 16h Get per-DIMM temperature range and refresh rate */
+ HSMP_GET_DIMM_POWER, /* 17h Get per-DIMM power consumption */
+ HSMP_GET_DIMM_THERMAL, /* 18h Get per-DIMM thermal sensors */
+ HSMP_GET_SOCKET_FREQ_LIMIT, /* 19h Get current active frequency per socket */
+ HSMP_GET_CCLK_CORE_LIMIT, /* 1Ah Get CCLK frequency limit per core */
+ HSMP_GET_RAILS_SVI, /* 1Bh Get SVI-based Telemetry for all rails */
+ HSMP_GET_SOCKET_FMAX_FMIN, /* 1Ch Get Fmax and Fmin per socket */
+ HSMP_GET_IOLINK_BANDWITH, /* 1Dh Get current bandwidth on IO Link */
+ HSMP_GET_XGMI_BANDWITH, /* 1Eh Get current bandwidth on xGMI Link */
+ HSMP_SET_GMI3_WIDTH, /* 1Fh Set max and min GMI3 Link width */
+ HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */
+ HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */
+ HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */
HSMP_MSG_ID_MAX,
};
@@ -175,8 +188,12 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = {
*/
{1, 0, HSMP_SET},
- /* RESERVED message */
- {0, 0, HSMP_RSVD},
+ /*
+ * HSMP_GET_NBIO_DPM_LEVEL, num_args = 1, response_sz = 1
+ * input: args[0] = nbioid[23:16]
+ * output: args[0] = max dpm level[15:8] + min dpm level[7:0]
+ */
+ {1, 1, HSMP_GET},
/*
* HSMP_GET_DDR_BANDWIDTH, num_args = 0, response_sz = 1
@@ -191,6 +208,93 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = {
* [7:5] fractional part
*/
{0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_TEMP_RANGE, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = refresh rate[3] + temperature range[2:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_POWER, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = DIMM power in mW[31:17] + update rate in ms[16:8] +
+ * DIMM address[7:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1
+ * input: args[0] = DIMM address[7:0]
+ * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] +
+ * DIMM address[7:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_FREQ_LIMIT, num_args = 0, response_sz = 1
+ * output: args[0] = frequency in MHz[31:16] + frequency source[15:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_CCLK_CORE_LIMIT, num_args = 1, response_sz = 1
+ * input: args[0] = apic id [31:0]
+ * output: args[0] = frequency in MHz[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_RAILS_SVI, num_args = 0, response_sz = 1
+ * output: args[0] = power in mW[31:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_SOCKET_FMAX_FMIN, num_args = 0, response_sz = 1
+ * output: args[0] = fmax in MHz[31:16] + fmin in MHz[15:0]
+ */
+ {0, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_IOLINK_BANDWITH, num_args = 1, response_sz = 1
+ * input: args[0] = link id[15:8] + bw type[2:0]
+ * output: args[0] = io bandwidth in Mbps[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_GET_XGMI_BANDWITH, num_args = 1, response_sz = 1
+ * input: args[0] = link id[15:8] + bw type[2:0]
+ * output: args[0] = xgmi bandwidth in Mbps[31:0]
+ */
+ {1, 1, HSMP_GET},
+
+ /*
+ * HSMP_SET_GMI3_WIDTH, num_args = 1, response_sz = 0
+ * input: args[0] = min link width[15:8] + max link width[7:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_PCI_RATE, num_args = 1, response_sz = 1
+ * input: args[0] = link rate control value
+ * output: args[0] = previous link rate control value
+ */
+ {1, 1, HSMP_SET},
+
+ /*
+ * HSMP_SET_POWER_MODE, num_args = 1, response_sz = 0
+ * input: args[0] = power efficiency mode[2:0]
+ */
+ {1, 0, HSMP_SET},
+
+ /*
+ * HSMP_SET_PSTATE_MAX_MIN, num_args = 1, response_sz = 0
+ * input: args[0] = min df pstate[15:8] + max df pstate[7:0]
+ */
+ {1, 0, HSMP_SET},
};
/* Reset to default packing */
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index b25d3f82c2f3..bea5cdcdf532 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -10,6 +10,7 @@
#define SETUP_EFI 4
#define SETUP_APPLE_PROPERTIES 5
#define SETUP_JAILHOUSE 6
+#define SETUP_CC_BLOB 7
#define SETUP_INDIRECT (1<<31)
@@ -187,7 +188,8 @@ struct boot_params {
__u32 ext_ramdisk_image; /* 0x0c0 */
__u32 ext_ramdisk_size; /* 0x0c4 */
__u32 ext_cmd_line_ptr; /* 0x0c8 */
- __u8 _pad4[116]; /* 0x0cc */
+ __u8 _pad4[112]; /* 0x0cc */
+ __u32 cc_blob_address; /* 0x13c */
struct edid_info edid_info; /* 0x140 */
struct efi_info efi_info; /* 0x1c0 */
__u32 alt_mem_k; /* 0x1e0 */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index efa969325ede..f69c168391aa 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -108,6 +108,14 @@
#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005
#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1
+#define SVM_VMGEXIT_PSC 0x80000010
+#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011
+#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012
+#define SVM_VMGEXIT_AP_CREATION 0x80000013
+#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0
+#define SVM_VMGEXIT_AP_CREATE 1
+#define SVM_VMGEXIT_AP_DESTROY 2
+#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
/* Exit code reserved for hypervisor/software use */
@@ -218,6 +226,11 @@
{ SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \
{ SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \
{ SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \
+ { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \
+ { SVM_VMGEXIT_GUEST_REQUEST, "vmgexit_guest_request" }, \
+ { SVM_VMGEXIT_EXT_GUEST_REQUEST, "vmgexit_ext_guest_request" }, \
+ { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \
+ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \
{ SVM_EXIT_ERR, "invalid_guest_state" }
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index c41ef42adbe8..1a2dc328cb5e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -46,8 +46,6 @@ endif
# non-deterministic coverage.
KCOV_INSTRUMENT := n
-CFLAGS_head$(BITS).o += -fno-stack-protector
-
CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0d01e7f5078c..6d2c50819501 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -65,6 +65,13 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
static bool acpi_support_online_capable;
#endif
+#ifdef CONFIG_X86_64
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr;
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
+#endif
+
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
@@ -336,7 +343,60 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}
-#endif /*CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_X86_64
+static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
+{
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and OS. Firmware will
+ * listen on mailbox command address, and once it receives the wakeup
+ * command, the CPU associated with the given apicid will be booted.
+ *
+ * The value of 'apic_id' and 'wakeup_vector' must be visible to the
+ * firmware before the wakeup command is visible. smp_store_release()
+ * ensures ordering and visibility.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ acpi_mp_wake_mailbox->wakeup_vector = start_ip;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ /*
+ * Wait for the CPU to wake up.
+ *
+ * The CPU being woken up is essentially in a spin loop waiting to be
+ * woken up. It should not take long for it wake up and acknowledge by
+ * zeroing out ->command.
+ *
+ * ACPI specification doesn't provide any guidance on how long kernel
+ * has to wait for a wake up acknowledgement. It also doesn't provide
+ * a way to cancel a wake up request if it takes too long.
+ *
+ * In TDX environment, the VMM has control over how long it takes to
+ * wake up secondary. It can postpone scheduling secondary vCPU
+ * indefinitely. Giving up on wake up request and reporting error opens
+ * possible attack vector for VMM: it can wake up a secondary CPU when
+ * kernel doesn't expect it. Wait until positive result of the wake up
+ * request.
+ */
+ while (READ_ONCE(acpi_mp_wake_mailbox->command))
+ cpu_relax();
+
+ return 0;
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
#define MP_ISA_BUS 0
@@ -1083,6 +1143,29 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
+
+#ifdef CONFIG_X86_64
+static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ return -ENODEV;
+
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+ if (BAD_MADT_ENTRY(mp_wake, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
+
+ acpi_wake_cpu_handler_update(acpi_wakeup_cpu);
+
+ return 0;
+}
+#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1278,6 +1361,14 @@ static void __init acpi_process_madt(void)
smp_found_config = 1;
}
+
+#ifdef CONFIG_X86_64
+ /*
+ * Parse MADT MP Wake entry.
+ */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP,
+ acpi_parse_mp_wake, 1);
+#endif
}
if (error == -EINVAL) {
/*
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index df1644d9b3b6..8b8cbf22461a 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -50,20 +50,17 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
return err;
}
-bool amd_set_max_freq_ratio(u64 *ratio)
+static void amd_set_max_freq_ratio(void)
{
struct cppc_perf_caps perf_caps;
u64 highest_perf, nominal_perf;
u64 perf_ratio;
int rc;
- if (!ratio)
- return false;
-
rc = cppc_get_perf_caps(0, &perf_caps);
if (rc) {
pr_debug("Could not retrieve perf counters (%d)\n", rc);
- return false;
+ return;
}
highest_perf = amd_get_highest_perf();
@@ -71,7 +68,7 @@ bool amd_set_max_freq_ratio(u64 *ratio)
if (!highest_perf || !nominal_perf) {
pr_debug("Could not retrieve highest or nominal performance\n");
- return false;
+ return;
}
perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
@@ -79,25 +76,27 @@ bool amd_set_max_freq_ratio(u64 *ratio)
perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
if (!perf_ratio) {
pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
- return false;
+ return;
}
- *ratio = perf_ratio;
- arch_set_max_freq_ratio(false);
-
- return true;
+ freq_invariance_set_perf_ratio(perf_ratio, false);
}
static DEFINE_MUTEX(freq_invariance_lock);
void init_freq_invariance_cppc(void)
{
- static bool secondary;
+ static bool init_done;
- mutex_lock(&freq_invariance_lock);
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return;
- init_freq_invariance(secondary, true);
- secondary = true;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
+ mutex_lock(&freq_invariance_lock);
+ if (!init_done)
+ amd_set_max_freq_ratio();
+ init_done = true;
mutex_unlock(&freq_invariance_lock);
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d374cb3cf024..3c66073e7645 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -338,7 +338,7 @@ next:
}
}
-#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION)
+#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
/*
* CALL/JMP *%\reg
@@ -507,11 +507,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
}
}
-#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */
+#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
-#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */
+#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
#ifdef CONFIG_X86_KERNEL_IBT
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 020c906f7934..190e0f763375 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -188,7 +188,7 @@ int amd_smn_write(u16 node, u32 address, u32 value)
EXPORT_SYMBOL_GPL(amd_smn_write);
-int amd_cache_northbridges(void)
+static int amd_cache_northbridges(void)
{
const struct pci_device_id *misc_ids = amd_nb_misc_ids;
const struct pci_device_id *link_ids = amd_nb_link_ids;
@@ -210,14 +210,14 @@ int amd_cache_northbridges(void)
}
misc = NULL;
- while ((misc = next_northbridge(misc, misc_ids)) != NULL)
+ while ((misc = next_northbridge(misc, misc_ids)))
misc_count++;
if (!misc_count)
return -ENODEV;
root = NULL;
- while ((root = next_northbridge(root, root_ids)) != NULL)
+ while ((root = next_northbridge(root, root_ids)))
root_count++;
if (root_count) {
@@ -290,7 +290,6 @@ int amd_cache_northbridges(void)
return 0;
}
-EXPORT_SYMBOL_GPL(amd_cache_northbridges);
/*
* Ignores subdevice/subvendor but as far as I can figure out
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b70344bf6600..189d3a5e471a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -170,7 +170,7 @@ static __init int setup_apicpmtimer(char *s)
{
apic_calibrate_pmtmr = 1;
notsc_setup(NULL);
- return 0;
+ return 1;
}
__setup("apicpmtimer", setup_apicpmtimer);
#endif
@@ -320,6 +320,9 @@ int lapic_get_maxlvt(void)
#define APIC_DIVISOR 16
#define TSC_DIVISOR 8
+/* i82489DX specific */
+#define I82489DX_BASE_DIVIDER (((0x2) << 18))
+
/*
* This function sets up the local APIC timer, with a timeout of
* 'clocks' APIC bus clock. During calibration we actually call
@@ -340,8 +343,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
+ /*
+ * The i82489DX APIC uses bit 18 and 19 for the base divider. This
+ * overlaps with bit 18 on integrated APICs, but is not documented
+ * in the SDM. No problem though. i82489DX equipped systems do not
+ * have TSC deadline timer.
+ */
if (!lapic_is_integrated())
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+ lvtt_value |= I82489DX_BASE_DIVIDER;
if (!irqen)
lvtt_value |= APIC_LVT_MASKED;
@@ -1419,22 +1428,21 @@ void __init apic_intr_mode_init(void)
return;
case APIC_VIRTUAL_WIRE:
pr_info("APIC: Switch to virtual wire mode setup\n");
- default_setup_apic_routing();
break;
case APIC_VIRTUAL_WIRE_NO_CONFIG:
pr_info("APIC: Switch to virtual wire mode setup with no configuration\n");
upmode = true;
- default_setup_apic_routing();
break;
case APIC_SYMMETRIC_IO:
pr_info("APIC: Switch to symmetric I/O mode setup\n");
- default_setup_apic_routing();
break;
case APIC_SYMMETRIC_IO_NO_ROUTING:
pr_info("APIC: Switch to symmetric I/O mode setup in no SMP routine\n");
break;
}
+ default_setup_apic_routing();
+
if (x86_platform.apic_post_init)
x86_platform.apic_post_init();
@@ -2551,6 +2559,16 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
}
EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
+#ifdef CONFIG_X86_64
+void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
+ (*drv)->wakeup_secondary_cpu_64 = handler;
+}
+#endif
+
/*
* Override the generic EOI implementation with an optimized version.
* Only called during early boot when only one CPU is active and with
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c1bb384935b0..a868b76cd3d4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -65,6 +65,7 @@
#include <asm/irq_remapping.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
+#include <asm/pgtable.h>
#define for_each_ioapic(idx) \
for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
@@ -2677,6 +2678,19 @@ static struct resource * __init ioapic_setup_resources(void)
return res;
}
+static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys)
+{
+ pgprot_t flags = FIXMAP_PAGE_NOCACHE;
+
+ /*
+ * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot
+ * bits, just like normal ioremap():
+ */
+ flags = pgprot_decrypted(flags);
+
+ __set_fixmap(idx, phys, flags);
+}
+
void __init io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
@@ -2709,7 +2723,7 @@ fake_ioapic_page:
__func__, PAGE_SIZE, PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
- set_fixmap_nocache(idx, ioapic_phys);
+ io_apic_set_fixmap(idx, ioapic_phys);
apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
__fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
ioapic_phys);
@@ -2838,7 +2852,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
ioapics[idx].mp_config.apicaddr = address;
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ io_apic_set_fixmap(FIX_IO_APIC_BASE_0 + idx, address);
if (bad_ioapic_register(idx)) {
clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
return -ENODEV;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f5a48e66e4f5..482855227964 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -199,7 +199,13 @@ static void __init uv_tsc_check_sync(void)
int mmr_shift;
char *state;
- /* Different returns from different UV BIOS versions */
+ /* UV5 guarantees synced TSCs; do not zero TSC_ADJUST */
+ if (!is_uv(UV2|UV3|UV4)) {
+ mark_tsc_async_resets("UV5+");
+ return;
+ }
+
+ /* UV2,3,4, UV BIOS TSC sync state available */
mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
mmr_shift =
is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
@@ -1340,7 +1346,7 @@ static void __init decode_gam_params(unsigned long ptr)
static void __init decode_gam_rng_tbl(unsigned long ptr)
{
struct uv_gam_range_entry *gre = (struct uv_gam_range_entry *)ptr;
- unsigned long lgre = 0;
+ unsigned long lgre = 0, gend = 0;
int index = 0;
int sock_min = 999999, pnode_min = 99999;
int sock_max = -1, pnode_max = -1;
@@ -1374,6 +1380,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
flag, size, suffix[order],
gre->type, gre->nasid, gre->sockid, gre->pnode);
+ if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
+ gend = (unsigned long)gre->limit << UV_GAM_RANGE_SHFT;
+
/* update to next range start */
lgre = gre->limit;
if (sock_min > gre->sockid)
@@ -1391,7 +1400,8 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
_max_pnode = pnode_max;
_gr_table_len = index;
- pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode);
+ pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x), pnodes(min:%x,max:%x), gap_end(%d)\n",
+ index, _min_socket, _max_socket, _min_pnode, _max_pnode, fls64(gend));
}
/* Walk through UVsystab decoding the fields */
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 9fb0a2f8b62a..437308004ef2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -18,6 +18,7 @@
#include <asm/bootparam.h>
#include <asm/suspend.h>
#include <asm/tlbflush.h>
+#include <asm/tdx.h>
#ifdef CONFIG_XEN
#include <xen/interface/xen.h>
@@ -66,6 +67,22 @@ static void __used common(void)
#endif
BLANK();
+ OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx);
+ OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx);
+ OFFSET(TDX_MODULE_r8, tdx_module_output, r8);
+ OFFSET(TDX_MODULE_r9, tdx_module_output, r9);
+ OFFSET(TDX_MODULE_r10, tdx_module_output, r10);
+ OFFSET(TDX_MODULE_r11, tdx_module_output, r11);
+
+ BLANK();
+ OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10);
+ OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11);
+ OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12);
+ OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13);
+ OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14);
+ OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15);
+
+ BLANK();
OFFSET(BP_scratch, boot_params, scratch);
OFFSET(BP_secure_boot, boot_params, secure_boot);
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 9ca008f9e9b1..1f60a2b27936 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -6,146 +6,446 @@
* Copyright (C) 2017 Intel Corp.
* Author: Len Brown <len.brown@intel.com>
*/
-
+#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
-#include <linux/cpufreq.h>
-#include <linux/smp.h>
-#include <linux/sched/isolation.h>
#include <linux/rcupdate.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/topology.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
#include "cpu.h"
-struct aperfmperf_sample {
- unsigned int khz;
- atomic_t scfpending;
- ktime_t time;
- u64 aperf;
- u64 mperf;
+struct aperfmperf {
+ seqcount_t seq;
+ unsigned long last_update;
+ u64 acnt;
+ u64 mcnt;
+ u64 aperf;
+ u64 mperf;
};
-static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
+ .seq = SEQCNT_ZERO(cpu_samples.seq)
+};
-#define APERFMPERF_CACHE_THRESHOLD_MS 10
-#define APERFMPERF_REFRESH_DELAY_MS 10
-#define APERFMPERF_STALE_THRESHOLD_MS 1000
+static void init_counter_refs(void)
+{
+ u64 aperf, mperf;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ this_cpu_write(cpu_samples.aperf, aperf);
+ this_cpu_write(cpu_samples.mperf, mperf);
+}
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
- * aperfmperf_snapshot_khz()
- * On the current CPU, snapshot APERF, MPERF, and jiffies
- * unless we already did it within 10ms
- * calculate kHz, save snapshot
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
*/
-static void aperfmperf_snapshot_khz(void *dummy)
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
{
- u64 aperf, aperf_delta;
- u64 mperf, mperf_delta;
- struct aperfmperf_sample *s = this_cpu_ptr(&samples);
- unsigned long flags;
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
- local_irq_save(flags);
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
- local_irq_restore(flags);
+static bool __init turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#define X86_MATCH(model) \
+ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
+ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(XEON_PHI_KNL),
+ X86_MATCH(XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(ATOM_GOLDMONT),
+ X86_MATCH(ATOM_GOLDMONT_D),
+ X86_MATCH(ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
- aperf_delta = aperf - s->aperf;
- mperf_delta = mperf - s->mperf;
+static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
+
+static bool __init intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
/*
- * There is no architectural guarantee that MPERF
- * increments faster than we can read it.
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
*/
- if (mperf_delta == 0)
- return;
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
- s->time = ktime_get();
- s->aperf = aperf;
- s->mperf = mperf;
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
- atomic_set_release(&s->scfpending, 0);
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
+ arch_set_max_freq_ratio(turbo_disabled());
+
+ return true;
}
-static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
{
- s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
- struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+ register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
- /* Don't bother re-computing within the cache threshold time. */
- if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
- return true;
+static void freq_invariance_enable(void)
+{
+ if (static_branch_unlikely(&arch_scale_freq_key)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ static_branch_enable(&arch_scale_freq_key);
+ register_freq_invariance_syscore_ops();
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
+}
+
+void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
+{
+ arch_turbo_freq_ratio = ratio;
+ arch_set_max_freq_ratio(turbo_disabled);
+ freq_invariance_enable();
+}
+
+static void __init bp_init_freq_invariance(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
- if (!atomic_xchg(&s->scfpending, 1) || wait)
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+ if (intel_set_max_freq_ratio())
+ freq_invariance_enable();
+}
- /* Return false if the previous iteration was too long ago. */
- return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ static_branch_disable(&arch_scale_freq_key);
}
-unsigned int aperfmperf_get_khz(int cpu)
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- if (!cpu_khz)
- return 0;
+ u64 freq_scale;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
+ if (!arch_scale_freq_invariant())
+ return;
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
- if (rcu_is_idle_cpu(cpu))
- return 0; /* Idle CPUs are completely uninteresting. */
+ if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ goto error;
- aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
- return per_cpu(samples.khz, cpu);
+ freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
}
+#else
+static inline void bp_init_freq_invariance(void) { }
+static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
+#endif /* CONFIG_X86_64 && CONFIG_SMP */
-void arch_freq_prepare_all(void)
+void arch_scale_freq_tick(void)
{
- ktime_t now = ktime_get();
- bool wait = false;
- int cpu;
+ struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
+ u64 acnt, mcnt, aperf, mperf;
- if (!cpu_khz)
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
return;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+ acnt = aperf - s->aperf;
+ mcnt = mperf - s->mperf;
- for_each_online_cpu(cpu) {
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- continue;
- if (rcu_is_idle_cpu(cpu))
- continue; /* Idle CPUs are completely uninteresting. */
- if (!aperfmperf_snapshot_cpu(cpu, now, false))
- wait = true;
- }
+ s->aperf = aperf;
+ s->mperf = mperf;
+
+ raw_write_seqcount_begin(&s->seq);
+ s->last_update = jiffies;
+ s->acnt = acnt;
+ s->mcnt = mcnt;
+ raw_write_seqcount_end(&s->seq);
- if (wait)
- msleep(APERFMPERF_REFRESH_DELAY_MS);
+ scale_freq_tick(acnt, mcnt);
}
+/*
+ * Discard samples older than the define maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
unsigned int arch_freq_get_on_cpu(int cpu)
{
- struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned int seq, freq;
+ unsigned long last;
+ u64 acnt, mcnt;
- if (!cpu_khz)
- return 0;
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ goto fallback;
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return 0;
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
- if (!housekeeping_cpu(cpu, HK_TYPE_MISC))
- return 0;
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ goto fallback;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+
+fallback:
+ freq = cpufreq_quick_get(cpu);
+ return freq ? freq : cpu_khz;
+}
- if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
- return per_cpu(samples.khz, cpu);
+static int __init bp_init_aperfmperf(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
- msleep(APERFMPERF_REFRESH_DELAY_MS);
- atomic_set(&s->scfpending, 1);
- smp_mb(); /* ->scfpending before smp_call_function_single(). */
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+ init_counter_refs();
+ bp_init_freq_invariance();
+ return 0;
+}
+early_initcall(bp_init_aperfmperf);
- return per_cpu(samples.khz, cpu);
+void ap_init_aperfmperf(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ init_counter_refs();
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 6296e1ebed1d..d879a6c93609 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -446,6 +446,13 @@ void update_srbds_msr(void)
if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
return;
+ /*
+ * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX
+ * being disabled and it hasn't received the SRBDS MSR microcode.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ return;
+
rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
switch (srbds_mitigation) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ed4417500700..2e9142797c99 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -60,6 +60,7 @@
#include <asm/uv/uv.h>
#include <asm/sigframe.h>
#include <asm/traps.h>
+#include <asm/sev.h>
#include "cpu.h"
@@ -298,13 +299,6 @@ static int __init cachesize_setup(char *str)
}
__setup("cachesize=", cachesize_setup);
-static int __init x86_sep_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_SEP);
- return 1;
-}
-__setup("nosep", x86_sep_setup);
-
/* Standard macro to see if a specific flag is changeable */
static inline int flag_is_changeable_p(u32 flag)
{
@@ -376,26 +370,12 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
}
#endif
-static __init int setup_disable_smep(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_SMEP);
- return 1;
-}
-__setup("nosmep", setup_disable_smep);
-
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
cr4_set_bits(X86_CR4_SMEP);
}
-static __init int setup_disable_smap(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_SMAP);
- return 1;
-}
-__setup("nosmap", setup_disable_smap);
-
static __always_inline void setup_smap(struct cpuinfo_x86 *c)
{
unsigned long eflags = native_save_fl();
@@ -403,14 +383,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
/* This should have been cleared long ago */
BUG_ON(eflags & X86_EFLAGS_AC);
- if (cpu_has(c, X86_FEATURE_SMAP)) {
-#ifdef CONFIG_X86_SMAP
+ if (cpu_has(c, X86_FEATURE_SMAP))
cr4_set_bits(X86_CR4_SMAP);
-#else
- clear_cpu_cap(c, X86_FEATURE_SMAP);
- cr4_clear_bits(X86_CR4_SMAP);
-#endif
- }
}
static __always_inline void setup_umip(struct cpuinfo_x86 *c)
@@ -1368,8 +1342,8 @@ static void detect_nopl(void)
static void __init cpu_parse_early_param(void)
{
char arg[128];
- char *argptr = arg;
- int arglen, res, bit;
+ char *argptr = arg, *opt;
+ int arglen, taint = 0;
#ifdef CONFIG_X86_32
if (cmdline_find_option_bool(boot_command_line, "no387"))
@@ -1397,21 +1371,61 @@ static void __init cpu_parse_early_param(void)
return;
pr_info("Clearing CPUID bits:");
- do {
- res = get_option(&argptr, &bit);
- if (res == 0 || res == 3)
- break;
- /* If the argument was too long, the last bit may be cut off */
- if (res == 1 && arglen >= sizeof(arg))
- break;
+ while (argptr) {
+ bool found __maybe_unused = false;
+ unsigned int bit;
- if (bit >= 0 && bit < NCAPINTS * 32) {
- pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ opt = strsep(&argptr, ",");
+
+ /*
+ * Handle naked numbers first for feature flags which don't
+ * have names.
+ */
+ if (!kstrtouint(opt, 10, &bit)) {
+ if (bit < NCAPINTS * 32) {
+
+#ifdef CONFIG_X86_FEATURE_NAMES
+ /* empty-string, i.e., ""-defined feature flags */
+ if (!x86_cap_flags[bit])
+ pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit));
+ else
+#endif
+ pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+
+ setup_clear_cpu_cap(bit);
+ taint++;
+ }
+ /*
+ * The assumption is that there are no feature names with only
+ * numbers in the name thus go to the next argument.
+ */
+ continue;
+ }
+
+#ifdef CONFIG_X86_FEATURE_NAMES
+ for (bit = 0; bit < 32 * NCAPINTS; bit++) {
+ if (!x86_cap_flag(bit))
+ continue;
+
+ if (strcmp(x86_cap_flag(bit), opt))
+ continue;
+
+ pr_cont(" %s", opt);
setup_clear_cpu_cap(bit);
+ taint++;
+ found = true;
+ break;
}
- } while (res == 2);
+
+ if (!found)
+ pr_cont(" (unknown: %s)", opt);
+#endif
+ }
pr_cont("\n");
+
+ if (taint)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
}
/*
@@ -1855,15 +1869,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
-}
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
- setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);
- return 1;
+ tsx_ap_init();
}
-__setup("noclflush", setup_noclflush);
void print_cpu_info(struct cpuinfo_x86 *c)
{
@@ -2124,6 +2132,9 @@ void cpu_init_exception_handling(void)
load_TR_desc();
+ /* GHCB needs to be setup to handle #VC. */
+ setup_ghcb();
+
/* Finally load the IDT */
load_current_idt();
}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index ee6f23f7587d..2a8e584fc991 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -55,11 +55,10 @@ enum tsx_ctrl_states {
extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
-extern void tsx_enable(void);
-extern void tsx_disable(void);
-extern void tsx_clear_cpuid(void);
+void tsx_ap_init(void);
#else
static inline void tsx_init(void) { }
+static inline void tsx_ap_init(void) { }
#endif /* CONFIG_CPU_SUP_INTEL */
extern void get_cpu_cap(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8321c43554a1..fd5dead8371c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,10 +7,13 @@
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/semaphore.h>
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
+#include <linux/workqueue.h>
#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -91,7 +94,7 @@ static bool ring3mwait_disabled __read_mostly;
static int __init ring3mwait_disable(char *__unused)
{
ring3mwait_disabled = true;
- return 0;
+ return 1;
}
__setup("ring3mwait=disable", ring3mwait_disable);
@@ -181,6 +184,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
return false;
}
+int intel_cpu_collect_info(struct ucode_cpu_info *uci)
+{
+ unsigned int val[2];
+ unsigned int family, model;
+ struct cpu_signature csig = { 0 };
+ unsigned int eax, ebx, ecx, edx;
+
+ memset(uci, 0, sizeof(*uci));
+
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ csig.sig = eax;
+
+ family = x86_family(eax);
+ model = x86_model(eax);
+
+ if (model >= 5 || family > 6) {
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ csig.pf = 1 << ((val[1] >> 18) & 7);
+ }
+
+ csig.rev = intel_get_microcode_revision();
+
+ uci->cpu_sig = csig;
+ uci->valid = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
+
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
@@ -717,13 +752,6 @@ static void init_intel(struct cpuinfo_x86 *c)
init_intel_misc_features(c);
- if (tsx_ctrl_state == TSX_CTRL_ENABLE)
- tsx_enable();
- else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
- tsx_disable();
- else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
- tsx_clear_cpuid();
-
split_lock_init();
bus_lock_init();
@@ -1006,6 +1034,8 @@ static const struct {
static struct ratelimit_state bld_ratelimit;
+static DEFINE_SEMAPHORE(buslock_sem);
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
int len = strlen(opt), ratelimit;
@@ -1116,18 +1146,52 @@ static void split_lock_init(void)
split_lock_verify_msr(sld_state != sld_off);
}
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
+
static void split_lock_warn(unsigned long ip)
{
- pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
- current->comm, current->pid, ip);
+ int cpu;
- /*
- * Disable the split lock detection for this task so it can make
- * progress and set TIF_SLD so the detection is re-enabled via
- * switch_to_sld() when the task is scheduled out.
- */
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ /* misery factor #1, sleep 10ms before trying to execute split lock */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /* Misery factor #2, only allow one buslocked disabled core at a time */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ cpu = get_cpu();
+ schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
sld_update_msr(false);
- set_tsk_thread_flag(current, TIF_SLD);
+ put_cpu();
}
bool handle_guest_split_lock(unsigned long ip)
@@ -1201,18 +1265,6 @@ void handle_bus_lock(struct pt_regs *regs)
}
/*
- * This function is called only when switching between tasks with
- * different split-lock detection modes. It sets the MSR for the
- * mode of the new task. This is right most of the time, but since
- * the MSR is shared by hyperthreads on a physical core there can
- * be glitches when the two threads need different modes.
- */
-void switch_to_sld(unsigned long tifn)
-{
- sld_update_msr(!(tifn & _TIF_SLD));
-}
-
-/*
* Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should
* only be trusted if it is confirmed that a CPU model implements a
* specific feature at a particular bit position.
@@ -1237,6 +1289,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1),
{}
};
@@ -1281,10 +1334,14 @@ static void sld_state_show(void)
pr_info("disabled\n");
break;
case sld_warn:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
- else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
pr_info("#DB: warning on user-space bus_locks\n");
+ }
break;
case sld_fatal:
if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1940d305db1c..1c87501e0fa3 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1294,10 +1294,23 @@ out_free:
kfree(bank);
}
+static void __threshold_remove_device(struct threshold_bank **bp)
+{
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (!bp[bank])
+ continue;
+
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
+ kfree(bp);
+}
+
int mce_threshold_remove_device(unsigned int cpu)
{
struct threshold_bank **bp = this_cpu_read(threshold_banks);
- unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
if (!bp)
return 0;
@@ -1308,13 +1321,7 @@ int mce_threshold_remove_device(unsigned int cpu)
*/
this_cpu_write(threshold_banks, NULL);
- for (bank = 0; bank < numbanks; bank++) {
- if (bp[bank]) {
- threshold_remove_bank(bp[bank]);
- bp[bank] = NULL;
- }
- }
- kfree(bp);
+ __threshold_remove_device(bp);
return 0;
}
@@ -1351,15 +1358,14 @@ int mce_threshold_create_device(unsigned int cpu)
if (!(this_cpu_read(bank_map) & (1 << bank)))
continue;
err = threshold_create_bank(bp, cpu, bank);
- if (err)
- goto out_err;
+ if (err) {
+ __threshold_remove_device(bp);
+ return err;
+ }
}
this_cpu_write(threshold_banks, bp);
if (thresholding_irq_en)
mce_threshold_vector = amd_threshold_interrupt;
return 0;
-out_err:
- mce_threshold_remove_device(cpu);
- return err;
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 981496e6bc0e..d775fcd74e98 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -69,7 +69,9 @@ DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
struct mce_bank {
u64 ctl; /* subevents to enable */
- bool init; /* initialise bank? */
+
+ __u64 init : 1, /* initialise bank? */
+ __reserved_1 : 63;
};
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 1add86935349..00483d1c27e4 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -301,85 +301,65 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
}
}
-static __always_inline int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
+/* See AMD PPR(s) section Machine Check Error Handling. */
+static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
{
- u64 mcx_cfg;
+ char *panic_msg = NULL;
+ int ret;
/*
- * We need to look at the following bits:
- * - "succor" bit (data poisoning support), and
- * - TCC bit (Task Context Corrupt)
- * in MCi_STATUS to determine error severity.
+ * Default return value: Action required, the error must be handled
+ * immediately.
*/
- if (!mce_flags.succor)
- return MCE_PANIC_SEVERITY;
-
- mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank));
-
- /* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
- if ((mcx_cfg & MCI_CONFIG_MCAX) &&
- (m->status & MCI_STATUS_TCC) &&
- (err_ctx == IN_KERNEL))
- return MCE_PANIC_SEVERITY;
-
- /* ...otherwise invoke hwpoison handler. */
- return MCE_AR_SEVERITY;
-}
-
-/*
- * See AMD Error Scope Hierarchy table in a newer BKDG. For example
- * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
- */
-static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
-{
- enum context ctx = error_context(m, regs);
+ ret = MCE_AR_SEVERITY;
/* Processor Context Corrupt, no need to fumble too much, die! */
- if (m->status & MCI_STATUS_PCC)
- return MCE_PANIC_SEVERITY;
-
- if (m->status & MCI_STATUS_UC) {
-
- if (ctx == IN_KERNEL)
- return MCE_PANIC_SEVERITY;
+ if (m->status & MCI_STATUS_PCC) {
+ panic_msg = "Processor Context Corrupt";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
- /*
- * On older systems where overflow_recov flag is not present, we
- * should simply panic if an error overflow occurs. If
- * overflow_recov flag is present and set, then software can try
- * to at least kill process to prolong system operation.
- */
- if (mce_flags.overflow_recov) {
- if (mce_flags.smca)
- return mce_severity_amd_smca(m, ctx);
-
- /* kill current process */
- return MCE_AR_SEVERITY;
- } else {
- /* at least one error was not logged */
- if (m->status & MCI_STATUS_OVER)
- return MCE_PANIC_SEVERITY;
- }
-
- /*
- * For any other case, return MCE_UC_SEVERITY so that we log the
- * error and exit #MC handler.
- */
- return MCE_UC_SEVERITY;
+ if (m->status & MCI_STATUS_DEFERRED) {
+ ret = MCE_DEFERRED_SEVERITY;
+ goto out;
}
/*
- * deferred error: poll handler catches these and adds to mce_ring so
- * memory-failure can take recovery actions.
+ * If the UC bit is not set, the system either corrected or deferred
+ * the error. No action will be required after logging the error.
*/
- if (m->status & MCI_STATUS_DEFERRED)
- return MCE_DEFERRED_SEVERITY;
+ if (!(m->status & MCI_STATUS_UC)) {
+ ret = MCE_KEEP_SEVERITY;
+ goto out;
+ }
/*
- * corrected error: poll handler catches these and passes responsibility
- * of decoding the error to EDAC
+ * On MCA overflow, without the MCA overflow recovery feature the
+ * system will not be able to recover, panic.
*/
- return MCE_KEEP_SEVERITY;
+ if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
+ panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (!mce_flags.succor) {
+ panic_msg = "Uncorrected error without MCA Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (error_context(m, regs) == IN_KERNEL) {
+ panic_msg = "Uncorrected unrecoverable error in kernel context";
+ ret = MCE_PANIC_SEVERITY;
+ }
+
+out:
+ if (msg && panic_msg)
+ *msg = panic_msg;
+
+ return ret;
}
static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index f955d25076ba..239ff5fcec6a 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -758,9 +758,9 @@ static struct subsys_interface mc_cpu_interface = {
};
/**
- * mc_bp_resume - Update boot CPU microcode during resume.
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
*/
-static void mc_bp_resume(void)
+void microcode_bsp_resume(void)
{
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -772,7 +772,7 @@ static void mc_bp_resume(void)
}
static struct syscore_ops mc_syscore_ops = {
- .resume = mc_bp_resume,
+ .resume = microcode_bsp_resume,
};
static int mc_cpu_starting(unsigned int cpu)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index d28a9f8f3fec..025c8f0cd948 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch;
/* last level cache size per core */
static int llc_size_per_core;
-static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
- unsigned int s2, unsigned int p2)
-{
- if (s1 != s2)
- return false;
-
- /* Processor flags are either both 0 ... */
- if (!p1 && !p2)
- return true;
-
- /* ... or they intersect. */
- return p1 & p2;
-}
-
/*
* Returns 1 if update has been found, 0 otherwise.
*/
@@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf)
struct extended_signature *ext_sig;
int i;
- if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
return 1;
/* Look for ext. headers: */
@@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf)
ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
for (i = 0; i < ext_hdr->count; i++) {
- if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
return 1;
ext_sig++;
}
@@ -342,37 +328,6 @@ next:
return patch;
}
-static int collect_cpu_info_early(struct ucode_cpu_info *uci)
-{
- unsigned int val[2];
- unsigned int family, model;
- struct cpu_signature csig = { 0 };
- unsigned int eax, ebx, ecx, edx;
-
- memset(uci, 0, sizeof(*uci));
-
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- csig.sig = eax;
-
- family = x86_family(eax);
- model = x86_model(eax);
-
- if ((model >= 5) || (family > 6)) {
- /* get processor flags from MSR 0x17 */
- native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- csig.pf = 1 << ((val[1] >> 18) & 7);
- }
-
- csig.rev = intel_get_microcode_revision();
-
- uci->cpu_sig = csig;
- uci->valid = 1;
-
- return 0;
-}
-
static void show_saved_mc(void)
{
#ifdef DEBUG
@@ -386,7 +341,7 @@ static void show_saved_mc(void)
return;
}
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
@@ -502,7 +457,7 @@ void show_ucode_info_early(void)
struct ucode_cpu_info uci;
if (delay_ucode_info) {
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
print_ucode_info(&uci, current_mc_date);
delay_ucode_info = 0;
}
@@ -604,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void)
if (!(cp.data && cp.size))
return 0;
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
scan_microcode(cp.data, cp.size, &uci, true);
@@ -637,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
if (!(cp.data && cp.size))
return NULL;
- collect_cpu_info_early(uci);
+ intel_cpu_collect_info(uci);
return scan_microcode(cp.data, cp.size, uci, false);
}
@@ -712,7 +667,7 @@ void reload_ucode_intel(void)
struct microcode_intel *p;
struct ucode_cpu_info uci;
- collect_cpu_info_early(&uci);
+ intel_cpu_collect_info(&uci);
p = find_patch(&uci);
if (!p)
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 4eec8889b0ff..099b6f0d96bd 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -84,14 +84,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = aperfmperf_get_khz(cpu);
-
- if (!freq)
- freq = cpufreq_quick_get(cpu);
- if (!freq)
- freq = cpu_khz;
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
+ unsigned int freq = arch_freq_get_on_cpu(cpu);
+
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 83f901e2c2df..f276aff521e8 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -341,14 +341,14 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus belong to parent ctrl group */
cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
return -EINVAL;
}
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
/* Give any dropped cpus to parent rdtgroup */
cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
update_closid_rmid(tmpmask, prgrp);
@@ -359,7 +359,7 @@ static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
* and update per-cpu rmid
*/
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
if (crgrp == rdtgrp)
@@ -394,7 +394,7 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
/* Check whether cpus are dropped from this group */
cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
/* Can't drop from default group */
if (rdtgrp == &rdtgroup_default) {
rdt_last_cmd_puts("Can't drop CPUs from default group\n");
@@ -413,12 +413,12 @@ static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
* and update per-cpu closid/rmid.
*/
cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
if (r == rdtgrp)
continue;
cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
- if (cpumask_weight(tmpmask1))
+ if (!cpumask_empty(tmpmask1))
cpumask_rdtgrp_clear(r, tmpmask1);
}
update_closid_rmid(tmpmask, rdtgrp);
@@ -488,7 +488,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
/* check that user didn't specify any offline cpus */
cpumask_andnot(tmpmask, newmask, cpu_online_mask);
- if (cpumask_weight(tmpmask)) {
+ if (!cpumask_empty(tmpmask)) {
ret = -EINVAL;
rdt_last_cmd_puts("Can only assign online CPUs\n");
goto unlock;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 4143b1e4c5c6..dbaa8326d6f2 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 7c63a1911fae..3c24e6124d95 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,92 @@
#include "encls.h"
#include "sgx.h"
+#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
+/*
+ * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
+ * determine the page index associated with the first PCMD entry
+ * within a PCMD page.
+ */
+#define PCMD_FIRST_MASK GENMASK(4, 0)
+
+/**
+ * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
+ * a PCMD page is in process of being reclaimed.
+ * @encl: Enclave to which PCMD page belongs
+ * @start_addr: Address of enclave page using first entry within the PCMD page
+ *
+ * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
+ * stored. The PCMD data of a reclaimed enclave page contains enough
+ * information for the processor to verify the page at the time
+ * it is loaded back into the Enclave Page Cache (EPC).
+ *
+ * The backing storage to which enclave pages are reclaimed is laid out as
+ * follows:
+ * Encrypted enclave pages:SECS page:PCMD pages
+ *
+ * Each PCMD page contains the PCMD metadata of
+ * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
+ *
+ * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
+ * process of getting data (and thus soon being non-empty). (b) is tested with
+ * a check if an enclave page sharing the PCMD page is in the process of being
+ * reclaimed.
+ *
+ * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
+ * intends to reclaim that enclave page - it means that the PCMD page
+ * associated with that enclave page is about to get some data and thus
+ * even if the PCMD page is empty, it should not be truncated.
+ *
+ * Context: Enclave mutex (&sgx_encl->lock) must be held.
+ * Return: 1 if the reclaimer is about to write to the PCMD page
+ * 0 if the reclaimer has no intention to write to the PCMD page
+ */
+static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
+ unsigned long start_addr)
+{
+ int reclaimed = 0;
+ int i;
+
+ /*
+ * PCMD_FIRST_MASK is based on number of PCMD entries within
+ * PCMD page being 32.
+ */
+ BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
+
+ for (i = 0; i < PCMDS_PER_PAGE; i++) {
+ struct sgx_encl_page *entry;
+ unsigned long addr;
+
+ addr = start_addr + i * PAGE_SIZE;
+
+ /*
+ * Stop when reaching the SECS page - it does not
+ * have a page_array entry and its reclaim is
+ * started and completed with enclave mutex held so
+ * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
+ * flag.
+ */
+ if (addr == encl->base + encl->size)
+ break;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ continue;
+
+ /*
+ * VA page slot ID uses same bit as the flag so it is important
+ * to ensure that the page is not already in backing store.
+ */
+ if (entry->epc_page &&
+ (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
+ reclaimed = 1;
+ break;
+ }
+ }
+
+ return reclaimed;
+}
+
/*
* Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
* follow right after the EPC data in the backing storage. In addition to the
@@ -47,6 +133,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
struct sgx_encl *encl = encl_page->encl;
pgoff_t page_index, page_pcmd_off;
+ unsigned long pcmd_first_page;
struct sgx_pageinfo pginfo;
struct sgx_backing b;
bool pcmd_page_empty;
@@ -58,6 +145,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
else
page_index = PFN_DOWN(encl->size);
+ /*
+ * Address of enclave page using the first entry within the PCMD page.
+ */
+ pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
+
page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
ret = sgx_encl_get_backing(encl, page_index, &b);
@@ -84,6 +176,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
}
memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+ set_page_dirty(b.pcmd);
/*
* The area for the PCMD in the page was zeroed above. Check if the
@@ -94,12 +187,20 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
kunmap_atomic(pcmd_page);
kunmap_atomic((void *)(unsigned long)pginfo.contents);
- sgx_encl_put_backing(&b, false);
+ get_page(b.pcmd);
+ sgx_encl_put_backing(&b);
sgx_encl_truncate_backing_page(encl, page_index);
- if (pcmd_page_empty)
+ if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ pcmd_page = kmap_atomic(b.pcmd);
+ if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
+ pr_warn("PCMD page not empty after truncate.\n");
+ kunmap_atomic(pcmd_page);
+ }
+
+ put_page(b.pcmd);
return ret;
}
@@ -645,15 +746,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
/**
* sgx_encl_put_backing() - Unpin the backing storage
* @backing: data for accessing backing storage for the page
- * @do_write: mark pages dirty
*/
-void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write)
+void sgx_encl_put_backing(struct sgx_backing *backing)
{
- if (do_write) {
- set_page_dirty(backing->pcmd);
- set_page_dirty(backing->contents);
- }
-
put_page(backing->pcmd);
put_page(backing->contents);
}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index fec43ca65065..d44e7372151f 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -107,7 +107,7 @@ void sgx_encl_release(struct kref *ref);
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing);
-void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write);
+void sgx_encl_put_backing(struct sgx_backing *backing);
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
struct sgx_encl_page *page);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8e4bc6453d26..ab4ec54bbdd9 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -191,6 +191,8 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
backing->pcmd_offset;
ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
+ set_page_dirty(backing->pcmd);
+ set_page_dirty(backing->contents);
kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
backing->pcmd_offset));
@@ -308,6 +310,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_ewb(epc_page, backing);
encl_page->epc_page = NULL;
encl->secs_child_cnt--;
+ sgx_encl_put_backing(backing);
if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
@@ -320,7 +323,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
- sgx_encl_put_backing(&secs_backing, true);
+ sgx_encl_put_backing(&secs_backing);
}
out:
@@ -379,11 +382,14 @@ static void sgx_reclaim_pages(void)
goto skip;
page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+
+ mutex_lock(&encl_page->encl->lock);
ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
- if (ret)
+ if (ret) {
+ mutex_unlock(&encl_page->encl->lock);
goto skip;
+ }
- mutex_lock(&encl_page->encl->lock);
encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
mutex_unlock(&encl_page->encl->lock);
continue;
@@ -411,7 +417,6 @@ skip:
encl_page = epc_page->owner;
sgx_reclaimer_write(epc_page, &backing[i]);
- sgx_encl_put_backing(&backing[i], true);
kref_put(&encl_page->encl->refcount, sgx_encl_release);
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 9c7a5f049292..ec7bbac3a9f2 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -19,7 +19,7 @@
enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
-void tsx_disable(void)
+static void tsx_disable(void)
{
u64 tsx;
@@ -39,7 +39,7 @@ void tsx_disable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-void tsx_enable(void)
+static void tsx_enable(void)
{
u64 tsx;
@@ -58,7 +58,7 @@ void tsx_enable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-static bool __init tsx_ctrl_is_supported(void)
+static bool tsx_ctrl_is_supported(void)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -84,7 +84,45 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
return TSX_CTRL_ENABLE;
}
-void tsx_clear_cpuid(void)
+/*
+ * Disabling TSX is not a trivial business.
+ *
+ * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT
+ * which says that TSX is practically disabled (all transactions are
+ * aborted by default). When that bit is set, the kernel unconditionally
+ * disables TSX.
+ *
+ * In order to do that, however, it needs to dance a bit:
+ *
+ * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and
+ * the MSR is present only when *two* CPUID bits are set:
+ *
+ * - X86_FEATURE_RTM_ALWAYS_ABORT
+ * - X86_FEATURE_TSX_FORCE_ABORT
+ *
+ * 2. The second method is for CPUs which do not have the above-mentioned
+ * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX
+ * through that one. Those CPUs can also have the initially mentioned
+ * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy
+ * applies: TSX gets disabled unconditionally.
+ *
+ * When either of the two methods are present, the kernel disables TSX and
+ * clears the respective RTM and HLE feature flags.
+ *
+ * An additional twist in the whole thing presents late microcode loading
+ * which, when done, may cause for the X86_FEATURE_RTM_ALWAYS_ABORT CPUID
+ * bit to be set after the update.
+ *
+ * A subsequent hotplug operation on any logical CPU except the BSP will
+ * cause for the supported CPUID feature bits to get re-detected and, if
+ * RTM and HLE get cleared all of a sudden, but, userspace did consult
+ * them before the update, then funny explosions will happen. Long story
+ * short: the kernel doesn't modify CPUID feature bits after booting.
+ *
+ * That's why, this function's call in init_intel() doesn't clear the
+ * feature flags.
+ */
+static void tsx_clear_cpuid(void)
{
u64 msr;
@@ -97,6 +135,39 @@ void tsx_clear_cpuid(void)
rdmsrl(MSR_TSX_FORCE_ABORT, msr);
msr |= MSR_TFA_TSX_CPUID_CLEAR;
wrmsrl(MSR_TSX_FORCE_ABORT, msr);
+ } else if (tsx_ctrl_is_supported()) {
+ rdmsrl(MSR_IA32_TSX_CTRL, msr);
+ msr |= TSX_CTRL_CPUID_CLEAR;
+ wrmsrl(MSR_IA32_TSX_CTRL, msr);
+ }
+}
+
+/*
+ * Disable TSX development mode
+ *
+ * When the microcode released in Feb 2022 is applied, TSX will be disabled by
+ * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123
+ * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is
+ * not recommended for production deployments. In particular, applying MD_CLEAR
+ * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient
+ * execution attack may not be effective on these processors when Intel TSX is
+ * enabled with updated microcode.
+ */
+static void tsx_dev_mode_disable(void)
+{
+ u64 mcu_opt_ctrl;
+
+ /* Check if RTM_ALLOW exists */
+ if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() ||
+ !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+
+ if (mcu_opt_ctrl & RTM_ALLOW) {
+ mcu_opt_ctrl &= ~RTM_ALLOW;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
}
}
@@ -105,14 +176,14 @@ void __init tsx_init(void)
char arg[5] = {};
int ret;
+ tsx_dev_mode_disable();
+
/*
- * Hardware will always abort a TSX transaction if both CPUID bits
- * RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are set. In this case, it is
- * better not to enumerate CPUID.RTM and CPUID.HLE bits. Clear them
- * here.
+ * Hardware will always abort a TSX transaction when the CPUID bit
+ * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate
+ * CPUID.RTM and CPUID.HLE bits. Clear them here.
*/
- if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
- boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
tsx_clear_cpuid();
setup_clear_cpu_cap(X86_FEATURE_RTM);
@@ -175,3 +246,16 @@ void __init tsx_init(void)
setup_force_cpu_cap(X86_FEATURE_HLE);
}
}
+
+void tsx_ap_init(void)
+{
+ tsx_dev_mode_disable();
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ /* See comment over that function for more details. */
+ tsx_clear_cpuid();
+}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e8326a8d1c5d..9730c88530fc 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -407,7 +407,7 @@ int crash_load_segments(struct kimage *image)
}
image->elf_load_addr = kbuf.mem;
pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->elf_load_addr, kbuf.bufsz, kbuf.bufsz);
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
return ret;
}
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index a7f617a3981d..97529552dd24 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -37,7 +37,6 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
} else
memcpy(buf, vaddr + offset, csize);
- set_iounmap_nonlazy();
iounmap((void __iomem *)vaddr);
return csize;
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index c049561f373a..e28ab0ecc537 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -41,17 +41,7 @@ struct fpu_state_config fpu_user_cfg __ro_after_init;
*/
struct fpstate init_fpstate __ro_after_init;
-/*
- * Track whether the kernel is using the FPU state
- * currently.
- *
- * This flag is used:
- *
- * - by IRQ context code to potentially use the FPU
- * if it's unused.
- *
- * - to debug kernel_fpu_begin()/end() correctness
- */
+/* Track in-kernel FPU usage */
static DEFINE_PER_CPU(bool, in_kernel_fpu);
/*
@@ -59,42 +49,37 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu);
*/
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
-static bool kernel_fpu_disabled(void)
-{
- return this_cpu_read(in_kernel_fpu);
-}
-
-static bool interrupted_kernel_fpu_idle(void)
-{
- return !kernel_fpu_disabled();
-}
-
-/*
- * Were we in user mode (or vm86 mode) when we were
- * interrupted?
- *
- * Doing kernel_fpu_begin/end() is ok if we are running
- * in an interrupt context from user mode - we'll just
- * save the FPU state as required.
- */
-static bool interrupted_user_mode(void)
-{
- struct pt_regs *regs = get_irq_regs();
- return regs && user_mode(regs);
-}
-
/*
* Can we use the FPU in kernel mode with the
* whole "kernel_fpu_begin/end()" sequence?
- *
- * It's always ok in process context (ie "not interrupt")
- * but it is sometimes ok even from an irq.
*/
bool irq_fpu_usable(void)
{
- return !in_interrupt() ||
- interrupted_user_mode() ||
- interrupted_kernel_fpu_idle();
+ if (WARN_ON_ONCE(in_nmi()))
+ return false;
+
+ /* In kernel FPU usage already active? */
+ if (this_cpu_read(in_kernel_fpu))
+ return false;
+
+ /*
+ * When not in NMI or hard interrupt context, FPU can be used in:
+ *
+ * - Task context except from within fpregs_lock()'ed critical
+ * regions.
+ *
+ * - Soft interrupt processing context which cannot happen
+ * while in a fpregs_lock()'ed critical region.
+ */
+ if (!in_hardirq())
+ return true;
+
+ /*
+ * In hard interrupt context it's safe when soft interrupts
+ * are enabled, which means the interrupt did not hit in
+ * a fpregs_lock()'ed critical region.
+ */
+ return !softirq_count();
}
EXPORT_SYMBOL(irq_fpu_usable);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 39e1c8626ab9..c8340156bfd2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -142,7 +142,8 @@ static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
* Non-compacted format and legacy features use the cached fixed
* offsets.
*/
- if (!cpu_feature_enabled(X86_FEATURE_XSAVES) || xfeature <= XFEATURE_SSE)
+ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) ||
+ xfeature <= XFEATURE_SSE)
return xstate_offsets[xfeature];
/*
@@ -369,12 +370,12 @@ static void __init setup_init_fpu_buf(void)
/*
* All components are now in init state. Read the state back so
* that init_fpstate contains all non-zero init state. This only
- * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
* those use the init optimization which skips writing data for
* components in init state.
*
* XSAVE could be used, but that would require to reshuffle the
- * data when XSAVES is available because XSAVES uses xstate
+ * data when XSAVEC/S is available because XSAVEC/S uses xstate
* compaction. But doing so is a pointless exercise because most
* components have an all zeros init state except for the legacy
* ones (FP and SSE). Those can be saved with FXSAVE into the
@@ -584,7 +585,8 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
*/
static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
{
- bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
+ bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
int i;
@@ -595,7 +597,7 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
* Supervisor state components can be managed only by
* XSAVES.
*/
- if (!compacted && xfeature_is_supervisor(i)) {
+ if (!xsaves && xfeature_is_supervisor(i)) {
XSTATE_WARN_ON(1);
return false;
}
@@ -612,8 +614,11 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
* the size of the *user* states. If we use it to size a buffer
* that we use 'XSAVES' on, we could potentially overflow the
* buffer because 'XSAVES' saves system states too.
+ *
+ * This also takes compaction into account. So this works for
+ * XSAVEC as well.
*/
-static unsigned int __init get_xsaves_size(void)
+static unsigned int __init get_compacted_size(void)
{
unsigned int eax, ebx, ecx, edx;
/*
@@ -623,6 +628,10 @@ static unsigned int __init get_xsaves_size(void)
* containing all the state components
* corresponding to bits currently set in
* XCR0 | IA32_XSS.
+ *
+ * When XSAVES is not available but XSAVEC is (virt), then there
+ * are no supervisor states, but XSAVEC still uses compacted
+ * format.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
return ebx;
@@ -632,13 +641,13 @@ static unsigned int __init get_xsaves_size(void)
* Get the total size of the enabled xstates without the independent supervisor
* features.
*/
-static unsigned int __init get_xsaves_size_no_independent(void)
+static unsigned int __init get_xsave_compacted_size(void)
{
u64 mask = xfeatures_mask_independent();
unsigned int size;
if (!mask)
- return get_xsaves_size();
+ return get_compacted_size();
/* Disable independent features. */
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
@@ -647,7 +656,7 @@ static unsigned int __init get_xsaves_size_no_independent(void)
* Ask the hardware what size is required of the buffer.
* This is the size required for the task->fpu buffer.
*/
- size = get_xsaves_size();
+ size = get_compacted_size();
/* Re-enable independent features so XSAVES will work on them again. */
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
@@ -687,20 +696,21 @@ static int __init init_xstate_size(void)
{
/* Recompute the context size for enabled features: */
unsigned int user_size, kernel_size, kernel_default_size;
- bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
/* Uncompacted user space size */
user_size = get_xsave_size_user();
/*
- * XSAVES kernel size includes supervisor states and
- * uses compacted format when available.
+ * XSAVES kernel size includes supervisor states and uses compacted
+ * format. XSAVEC uses compacted format, but does not save
+ * supervisor states.
*
- * XSAVE does not support supervisor states so
- * kernel and user size is identical.
+ * XSAVE[OPT] do not support supervisor states so kernel and user
+ * size is identical.
*/
if (compacted)
- kernel_size = get_xsaves_size_no_independent();
+ kernel_size = get_xsave_compacted_size();
else
kernel_size = user_size;
@@ -813,8 +823,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
if (!cpu_feature_enabled(X86_FEATURE_XFD))
fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;
- fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
- XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
+ else
+ fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED;
fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
@@ -837,6 +850,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
*/
init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;
+ /* Set up compaction feature bit */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
+ cpu_feature_enabled(X86_FEATURE_XSAVES))
+ setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);
+
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -873,7 +891,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
fpu_kernel_cfg.max_features,
fpu_kernel_cfg.max_size,
- boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
+ boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
return;
out_disable:
@@ -917,7 +935,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
return NULL;
- if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
+ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
return NULL;
}
@@ -1215,7 +1233,7 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
}
for (i = 0; i < XFEATURE_MAX; i++) {
- u64 mask = ((u64)1 << i);
+ mask = BIT_ULL(i);
if (hdr.xfeatures & mask) {
void *dst = __raw_xsave_addr(xsave, i);
@@ -1525,7 +1543,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
* vendors into extending XFD for the pre AMX states, especially
* AVX512.
*/
- bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
+ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
struct fpu *fpu = &current->group_leader->thread.fpu;
struct fpu_state_perm *perm;
unsigned int ksize, usize;
@@ -1687,16 +1705,13 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
* e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
* XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
*/
-long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2)
+long fpu_xstate_prctl(int option, unsigned long arg2)
{
u64 __user *uptr = (u64 __user *)arg2;
u64 permitted, supported;
unsigned long idx = arg2;
bool guest = false;
- if (tsk != current)
- return -EPERM;
-
switch (option) {
case ARCH_GET_XCOMP_SUPP:
supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features;
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index d22ace092ca2..5ad47031383b 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -16,7 +16,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
* XRSTORS requires these bits set in xcomp_bv, or it will
* trigger #GP:
*/
- if (cpu_feature_enabled(X86_FEATURE_XSAVES))
+ if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
}
@@ -79,6 +79,7 @@ static inline u64 xfeatures_mask_independent(void)
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27"
#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
@@ -97,9 +98,11 @@ static inline u64 xfeatures_mask_independent(void)
: "memory")
/*
- * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact
- * format and supervisor states in addition to modified optimization in
- * XSAVEOPT.
+ * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor
+ * states in addition to XSAVEC.
+ *
+ * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports
+ * compacted storage format in addition to XSAVEOPT.
*
* Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
* supports modified optimization which is not supported by XSAVE.
@@ -111,8 +114,9 @@ static inline u64 xfeatures_mask_independent(void)
* address of the instruction where we might get an exception at.
*/
#define XSTATE_XSAVE(st, lmask, hmask, err) \
- asm volatile(ALTERNATIVE_2(XSAVE, \
+ asm volatile(ALTERNATIVE_3(XSAVE, \
XSAVEOPT, X86_FEATURE_XSAVEOPT, \
+ XSAVEC, X86_FEATURE_XSAVEC, \
XSAVES, X86_FEATURE_XSAVES) \
"\n" \
"xor %[err], %[err]\n" \
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1e31c7d21597..b09d73c2ba89 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -579,9 +579,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
+#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
extern void ftrace_graph_call(void);
static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
@@ -610,18 +608,7 @@ int ftrace_disable_ftrace_graph_caller(void)
return ftrace_mod_jmp(ip, &ftrace_stub);
}
-#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
-int ftrace_enable_ftrace_graph_caller(void)
-{
- return 0;
-}
-
-int ftrace_disable_ftrace_graph_caller(void)
-{
- return 0;
-}
-#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
-#endif /* !CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
/*
* Hook the return address and push it in the stack of return addrs
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 4f5ecbbaae77..bd4a34100ed0 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -40,6 +40,7 @@
#include <asm/extable.h>
#include <asm/trapnr.h>
#include <asm/sev.h>
+#include <asm/tdx.h>
/*
* Manage page tables very early on.
@@ -143,7 +144,20 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
if (sme_get_me_mask()) {
vaddr = (unsigned long)__start_bss_decrypted;
vaddr_end = (unsigned long)__end_bss_decrypted;
+
for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ /*
+ * On SNP, transition the page to shared in the RMP table so that
+ * it is consistent with the page table attribute change.
+ *
+ * __start_bss_decrypted has a virtual address in the high range
+ * mapping (kernel .text). PVALIDATE, by way of
+ * early_snp_set_memory_shared(), requires a valid virtual
+ * address but the kernel is currently running off of the identity
+ * mapping so use __pa() to get a *currently* valid virtual address.
+ */
+ early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD);
+
i = pmd_index(vaddr);
pmd[i] -= sme_get_me_mask();
}
@@ -192,9 +206,6 @@ unsigned long __head __startup_64(unsigned long physaddr,
if (load_delta & ~PMD_PAGE_MASK)
for (;;);
- /* Activate Secure Memory Encryption (SME) if supported and enabled */
- sme_enable(bp);
-
/* Include the SME encryption mask in the fixup value */
load_delta += sme_get_me_mask();
@@ -308,15 +319,6 @@ unsigned long __head __startup_64(unsigned long physaddr,
return sme_postprocess_startup(bp, pmd);
}
-unsigned long __startup_secondary_64(void)
-{
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
-}
-
/* Wipe all early page tables except for the kernel symbol map */
static void __init reset_early_page_tables(void)
{
@@ -416,6 +418,9 @@ void __init do_early_exception(struct pt_regs *regs, int trapnr)
trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs))
return;
+ if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs))
+ return;
+
early_fixup_exception(regs, trapnr);
}
@@ -514,6 +519,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
idt_setup_early_handler();
+ /* Needed before cc_platform_has() can be used for TDX */
+ tdx_early_init();
+
copy_bootdata(__va(real_mode_data));
/*
@@ -600,8 +608,10 @@ static void startup_64_load_idt(unsigned long physbase)
void early_setup_idt(void)
{
/* VMM Communication Exception */
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ setup_ghcb();
set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb);
+ }
bringup_idt_descr.address = (unsigned long)bringup_idt_table;
native_load_idt(&bringup_idt_descr);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b8e3019547a5..92c4afa2b729 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -65,10 +65,39 @@ SYM_CODE_START_NOALIGN(startup_64)
leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp
leaq _text(%rip), %rdi
+
+ /*
+ * initial_gs points to initial fixed_percpu_data struct with storage for
+ * the stack protector canary. Global pointer fixups are needed at this
+ * stage, so apply them as is done in fixup_pointer(), and initialize %gs
+ * such that the canary can be accessed at %gs:40 for subsequent C calls.
+ */
+ movl $MSR_GS_BASE, %ecx
+ movq initial_gs(%rip), %rax
+ movq $_text, %rdx
+ subq %rdx, %rax
+ addq %rdi, %rax
+ movq %rax, %rdx
+ shrq $32, %rdx
+ wrmsr
+
pushq %rsi
call startup_64_setup_env
popq %rsi
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /*
+ * Activate SEV/SME memory encryption if supported/enabled. This needs to
+ * be done now, since this also includes setup of the SEV-SNP CPUID table,
+ * which needs to be done before any CPUID instructions are executed in
+ * subsequent code.
+ */
+ movq %rsi, %rdi
+ pushq %rsi
+ call sme_enable
+ popq %rsi
+#endif
+
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
leaq .Lon_kernel_cs(%rip), %rax
@@ -134,16 +163,32 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
*/
- pushq %rsi
- call __startup_secondary_64
- popq %rsi
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ movq sme_me_mask, %rax
+#else
+ xorq %rax, %rax
+#endif
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(init_top_pgt - __START_KERNEL_map), %rax
1:
+#ifdef CONFIG_X86_MCE
+ /*
+ * Preserve CR4.MCE if the kernel will enable #MC support.
+ * Clearing MCE may fault in some environments (that also force #MC
+ * support). Any machine check that occurs before #MC support is fully
+ * configured will crash the system regardless of the CR4.MCE value set
+ * here.
+ */
+ movq %cr4, %rcx
+ andl $X86_CR4_MCE, %ecx
+#else
+ movl $0, %ecx
+#endif
+
/* Enable PAE mode, PGE and LA57 */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ orl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
testl $1, __pgtable_l5_enabled(%rip)
jz 1f
@@ -249,13 +294,23 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
rdmsr
+ /*
+ * Preserve current value of EFER for comparison and to skip
+ * EFER writes if no change was made (for TDX guest)
+ */
+ movl %eax, %edx
btsl $_EFER_SCE, %eax /* Enable System Call */
btl $20,%edi /* No Execute supported? */
jnc 1f
btsl $_EFER_NX, %eax
btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
-1: wrmsr /* Make changes effective */
+ /* Avoid writing EFER if no change was made (for TDX guest) */
+1: cmpl %edx, %eax
+ je 1f
+ xor %edx, %edx
+ wrmsr /* Make changes effective */
+1:
/* Setup cr0 */
movl $CR0_STATE, %eax
/* Make changes effective */
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 608eb63bf044..a58c6bc1cd68 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -69,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = {
*/
INTG(X86_TRAP_PF, asm_exc_page_fault),
#endif
+#ifdef CONFIG_INTEL_TDX_GUEST
+ INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
+#endif
};
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a22deb58f86d..8b1c45c9cda8 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -69,6 +69,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;
+static int has_guest_poll = 0;
/*
* No need for any "IO delay" on KVM
*/
@@ -706,14 +707,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
static int kvm_suspend(void)
{
+ u64 val = 0;
+
kvm_guest_cpu_offline(false);
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+ rdmsrl(MSR_KVM_POLL_CONTROL, val);
+ has_guest_poll = !(val & 1);
+#endif
return 0;
}
static void kvm_resume(void)
{
kvm_cpu_online(raw_smp_processor_id());
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+ if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+ wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+#endif
}
static struct syscore_ops kvm_syscore_ops = {
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e73f7df362f5..cec0bfa3bc04 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -157,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
struct nmi_desc *desc = nmi_to_desc(type);
unsigned long flags;
- if (!action->handler)
+ if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list)))
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -177,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
list_add_rcu(&action->list, &desc->head);
else
list_add_tail_rcu(&action->list, &desc->head);
-
+
raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
@@ -186,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler);
void unregister_nmi_handler(unsigned int type, const char *name)
{
struct nmi_desc *desc = nmi_to_desc(type);
- struct nmiaction *n;
+ struct nmiaction *n, *found = NULL;
unsigned long flags;
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -200,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name)
WARN(in_nmi(),
"Trying to free NMI (%s) from NMI context!\n", n->name);
list_del_rcu(&n->list);
+ found = n;
break;
}
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
- synchronize_rcu();
+ if (found) {
+ synchronize_rcu();
+ INIT_LIST_HEAD(&found->list);
+ }
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 36e84d904260..319fef37d9dc 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -21,6 +21,7 @@
#include <asm/sections.h>
#include <asm/io.h>
#include <asm/setup_arch.h>
+#include <asm/sev.h>
static struct resource system_rom_resource = {
.name = "System ROM",
@@ -197,11 +198,21 @@ static int __init romchecksum(const unsigned char *rom, unsigned long length)
void __init probe_roms(void)
{
- const unsigned char *rom;
unsigned long start, length, upper;
+ const unsigned char *rom;
unsigned char c;
int i;
+ /*
+ * The ROM memory range is not part of the e820 table and is therefore not
+ * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted
+ * memory, and SNP requires encrypted memory to be validated before access.
+ * Do that here.
+ */
+ snp_prep_memory(video_rom_resource.start,
+ ((system_rom_resource.end + 1) - video_rom_resource.start),
+ SNP_PAGE_STATE_PRIVATE);
+
/* video rom */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b370767f5b19..58fb48d3004f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -46,6 +46,7 @@
#include <asm/proto.h>
#include <asm/frame.h>
#include <asm/unwind.h>
+#include <asm/tdx.h>
#include "process.h"
@@ -160,6 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
savesegment(ds, p->thread.ds);
#else
p->thread.sp0 = (unsigned long) (childregs + 1);
+ savesegment(gs, p->thread.gs);
/*
* Clear all status flags including IF and set fixed bit. 64bit
* does not have this initialization as the frame does not contain
@@ -191,10 +193,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
if (sp)
childregs->sp = sp;
-#ifdef CONFIG_X86_32
- task_user_gs(p) = get_user_gs(current_pt_regs());
-#endif
-
if (unlikely(p->flags & PF_IO_WORKER)) {
/*
* An IO thread is a user space thread, but it doesn't
@@ -334,7 +332,7 @@ static int get_cpuid_mode(void)
return !test_thread_flag(TIF_NOCPUID);
}
-static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+static int set_cpuid_mode(unsigned long cpuid_enabled)
{
if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
return -ENODEV;
@@ -405,7 +403,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
}
/**
- * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
+ * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode
*/
void native_tss_update_io_bitmap(void)
{
@@ -686,9 +684,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
/* Enforce MSR update to ensure consistent state */
__speculation_ctrl_update(~tifn, tifn);
}
-
- if ((tifp ^ tifn) & _TIF_SLD)
- switch_to_sld(tifn);
}
/*
@@ -873,6 +868,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
} else if (prefer_mwait_c1_over_halt(c)) {
pr_info("using mwait in idle threads\n");
x86_idle = mwait_idle;
+ } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ pr_info("using TDX aware idle routine\n");
+ x86_idle = tdx_safe_halt;
} else
x86_idle = default_idle;
}
@@ -985,20 +983,19 @@ unsigned long __get_wchan(struct task_struct *p)
return addr;
}
-long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long arg2)
+long do_arch_prctl_common(int option, unsigned long arg2)
{
switch (option) {
case ARCH_GET_CPUID:
return get_cpuid_mode();
case ARCH_SET_CPUID:
- return set_cpuid_mode(task, arg2);
+ return set_cpuid_mode(arg2);
case ARCH_GET_XCOMP_SUPP:
case ARCH_GET_XCOMP_PERM:
case ARCH_REQ_XCOMP_PERM:
case ARCH_GET_XCOMP_GUEST_PERM:
case ARCH_REQ_XCOMP_GUEST_PERM:
- return fpu_xstate_prctl(task, option, arg2);
+ return fpu_xstate_prctl(option, arg2);
}
return -EINVAL;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 26edb1cd07a4..2f314b170c9f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -63,10 +63,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
unsigned long d0, d1, d2, d3, d6, d7;
unsigned short gs;
- if (user_mode(regs))
- gs = get_user_gs(regs);
- else
- savesegment(gs, gs);
+ savesegment(gs, gs);
show_ip(regs, log_lvl);
@@ -114,7 +111,7 @@ void release_thread(struct task_struct *dead_task)
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
- set_user_gs(regs, 0);
+ loadsegment(gs, 0);
regs->fs = 0;
regs->ds = __USER_DS;
regs->es = __USER_DS;
@@ -177,7 +174,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- lazy_save_gs(prev->gs);
+ savesegment(gs, prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -208,7 +205,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- lazy_load_gs(next->gs);
+ loadsegment(gs, next->gs);
this_cpu_write(current_task, next_p);
@@ -222,5 +219,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
- return do_arch_prctl_common(current, option, arg2);
+ return do_arch_prctl_common(option, arg2);
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e459253649be..1962008fe743 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -844,7 +844,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
ret = do_arch_prctl_64(current, option, arg2);
if (ret == -EINVAL)
- ret = do_arch_prctl_common(current, option, arg2);
+ ret = do_arch_prctl_common(option, arg2);
return ret;
}
@@ -852,7 +852,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
- return do_arch_prctl_common(current, option, arg2);
+ return do_arch_prctl_common(option, arg2);
}
#endif
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 98d10ef60571..37c12fb92906 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -170,9 +170,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
retval = *pt_regs_access(task_pt_regs(task), offset);
else {
if (task == current)
- retval = get_user_gs(task_pt_regs(task));
+ savesegment(gs, retval);
else
- retval = task_user_gs(task);
+ retval = task->thread.gs;
}
return retval;
}
@@ -210,7 +210,7 @@ static int set_segment_reg(struct task_struct *task,
break;
case offsetof(struct user_regs_struct, gs):
- task_user_gs(task) = value;
+ task->thread.gs = value;
}
return 0;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c95b9ac5a457..249981bf3d8a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -756,6 +756,30 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
+void x86_configure_nx(void)
+{
+ if (boot_cpu_has(X86_FEATURE_NX))
+ __supported_pte_mask |= _PAGE_NX;
+ else
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+
+static void __init x86_report_nx(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "missing in CPU!\n");
+ } else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+ printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#else
+ /* 32bit non-PAE kernel, NX cannot be used */
+ printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+ "cannot be enabled: non-PAE kernel!\n");
+#endif
+ }
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -896,9 +920,7 @@ void __init setup_arch(char **cmdline_p)
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
- * console setup can safely call set_fixmap()). It may then be called
- * again from within noexec_setup() during parsing early parameters
- * to honor the respective command line option.
+ * console setup can safely call set_fixmap()).
*/
x86_configure_nx();
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index ce987688bbc0..b478edf43bec 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -14,6 +14,68 @@
#define has_cpuflag(f) boot_cpu_has(f)
#endif
+/* I/O parameters for CPUID-related helpers */
+struct cpuid_leaf {
+ u32 fn;
+ u32 subfn;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+};
+
+/*
+ * Individual entries of the SNP CPUID table, as defined by the SNP
+ * Firmware ABI, Revision 0.9, Section 7.1, Table 14.
+ */
+struct snp_cpuid_fn {
+ u32 eax_in;
+ u32 ecx_in;
+ u64 xcr0_in;
+ u64 xss_in;
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+ u64 __reserved;
+} __packed;
+
+/*
+ * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9,
+ * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit
+ * of 64 entries per CPUID table.
+ */
+#define SNP_CPUID_COUNT_MAX 64
+
+struct snp_cpuid_table {
+ u32 count;
+ u32 __reserved1;
+ u64 __reserved2;
+ struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
+} __packed;
+
+/*
+ * Since feature negotiation related variables are set early in the boot
+ * process they must reside in the .data section so as not to be zeroed
+ * out when the .bss section is later cleared.
+ *
+ * GHCB protocol version negotiated with the hypervisor.
+ */
+static u16 ghcb_version __ro_after_init;
+
+/* Copy of the SNP firmware's CPUID page. */
+static struct snp_cpuid_table cpuid_table_copy __ro_after_init;
+
+/*
+ * These will be initialized based on CPUID table so that non-present
+ * all-zero leaves (for sparse tables) can be differentiated from
+ * invalid/out-of-range leaves. This is needed since all-zero leaves
+ * still need to be post-processed.
+ */
+static u32 cpuid_std_range_max __ro_after_init;
+static u32 cpuid_hyp_range_max __ro_after_init;
+static u32 cpuid_ext_range_max __ro_after_init;
+
static bool __init sev_es_check_cpu_features(void)
{
if (!has_cpuflag(X86_FEATURE_RDRAND)) {
@@ -24,15 +86,12 @@ static bool __init sev_es_check_cpu_features(void)
return true;
}
-static void __noreturn sev_es_terminate(unsigned int reason)
+static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
{
u64 val = GHCB_MSR_TERM_REQ;
- /*
- * Tell the hypervisor what went wrong - only reason-set 0 is
- * currently supported.
- */
- val |= GHCB_SEV_TERM_REASON(0, reason);
+ /* Tell the hypervisor what went wrong. */
+ val |= GHCB_SEV_TERM_REASON(set, reason);
/* Request Guest Termination from Hypvervisor */
sev_es_wr_ghcb_msr(val);
@@ -42,6 +101,42 @@ static void __noreturn sev_es_terminate(unsigned int reason)
asm volatile("hlt\n" : : : "memory");
}
+/*
+ * The hypervisor features are available from GHCB version 2 onward.
+ */
+static u64 get_hv_features(void)
+{
+ u64 val;
+
+ if (ghcb_version < 2)
+ return 0;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ);
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP)
+ return 0;
+
+ return GHCB_MSR_HV_FT_RESP_VAL(val);
+}
+
+static void snp_register_ghcb_early(unsigned long paddr)
+{
+ unsigned long pfn = paddr >> PAGE_SHIFT;
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ /* If the response GPA is not ours then abort the guest */
+ if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) ||
+ (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);
+}
+
static bool sev_es_negotiate_protocol(void)
{
u64 val;
@@ -54,10 +149,12 @@ static bool sev_es_negotiate_protocol(void)
if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
return false;
- if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR ||
- GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR)
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
return false;
+ ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX);
+
return true;
}
@@ -104,10 +201,7 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt
if (ret == 1) {
u64 info = ghcb->save.sw_exit_info_2;
- unsigned long v;
-
- info = ghcb->save.sw_exit_info_2;
- v = info & SVM_EVTINJ_VEC_MASK;
+ unsigned long v = info & SVM_EVTINJ_VEC_MASK;
/* Check if exception information from hypervisor is sane. */
if ((info & SVM_EVTINJ_VALID) &&
@@ -130,7 +224,7 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr,
u64 exit_info_1, u64 exit_info_2)
{
/* Fill in protocol and format specifiers */
- ghcb->protocol_version = GHCB_PROTOCOL_MAX;
+ ghcb->protocol_version = ghcb_version;
ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
ghcb_set_sw_exit_code(ghcb, exit_code);
@@ -150,6 +244,290 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr,
return verify_exception_info(ghcb, ctxt);
}
+static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
+{
+ u64 val;
+
+ sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx));
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
+ return -EIO;
+
+ *reg = (val >> 32);
+
+ return 0;
+}
+
+static int sev_cpuid_hv(struct cpuid_leaf *leaf)
+{
+ int ret;
+
+ /*
+ * MSR protocol does not support fetching non-zero subfunctions, but is
+ * sufficient to handle current early-boot cases. Should that change,
+ * make sure to report an error rather than ignoring the index and
+ * grabbing random values. If this issue arises in the future, handling
+ * can be added here to use GHCB-page protocol for cases that occur late
+ * enough in boot that GHCB page is available.
+ */
+ if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn)
+ return -EINVAL;
+
+ ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx);
+ ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx);
+
+ return ret;
+}
+
+/*
+ * This may be called early while still running on the initial identity
+ * mapping. Use RIP-relative addressing to obtain the correct address
+ * while running with the initial identity mapping as well as the
+ * switch-over to kernel virtual addresses later.
+ */
+static const struct snp_cpuid_table *snp_cpuid_get_table(void)
+{
+ void *ptr;
+
+ asm ("lea cpuid_table_copy(%%rip), %0"
+ : "=r" (ptr)
+ : "p" (&cpuid_table_copy));
+
+ return ptr;
+}
+
+/*
+ * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of
+ * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0
+ * and 1 based on the corresponding features enabled by a particular
+ * combination of XCR0 and XSS registers so that a guest can look up the
+ * version corresponding to the features currently enabled in its XCR0/XSS
+ * registers. The only values that differ between these versions/table
+ * entries is the enabled XSAVE area size advertised via EBX.
+ *
+ * While hypervisors may choose to make use of this support, it is more
+ * robust/secure for a guest to simply find the entry corresponding to the
+ * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the
+ * XSAVE area size using subfunctions 2 through 64, as documented in APM
+ * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here.
+ *
+ * Since base/legacy XSAVE area size is documented as 0x240, use that value
+ * directly rather than relying on the base size in the CPUID table.
+ *
+ * Return: XSAVE area size on success, 0 otherwise.
+ */
+static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ u64 xfeatures_found = 0;
+ u32 xsave_size = 0x240;
+ int i;
+
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+ if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64))
+ continue;
+ if (!(xfeatures_en & (BIT_ULL(e->ecx_in))))
+ continue;
+ if (xfeatures_found & (BIT_ULL(e->ecx_in)))
+ continue;
+
+ xfeatures_found |= (BIT_ULL(e->ecx_in));
+
+ if (compacted)
+ xsave_size += e->eax;
+ else
+ xsave_size = max(xsave_size, e->eax + e->ebx);
+ }
+
+ /*
+ * Either the guest set unsupported XCR0/XSS bits, or the corresponding
+ * entries in the CPUID table were not present. This is not a valid
+ * state to be in.
+ */
+ if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
+ return 0;
+
+ return xsave_size;
+}
+
+static bool
+snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ int i;
+
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
+
+ if (e->eax_in != leaf->fn)
+ continue;
+
+ if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn)
+ continue;
+
+ /*
+ * For 0xD subfunctions 0 and 1, only use the entry corresponding
+ * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0).
+ * See the comments above snp_cpuid_calc_xsave_size() for more
+ * details.
+ */
+ if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1))
+ if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in)
+ continue;
+
+ leaf->eax = e->eax;
+ leaf->ebx = e->ebx;
+ leaf->ecx = e->ecx;
+ leaf->edx = e->edx;
+
+ return true;
+ }
+
+ return false;
+}
+
+static void snp_cpuid_hv(struct cpuid_leaf *leaf)
+{
+ if (sev_cpuid_hv(leaf))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
+}
+
+static int snp_cpuid_postprocess(struct cpuid_leaf *leaf)
+{
+ struct cpuid_leaf leaf_hv = *leaf;
+
+ switch (leaf->fn) {
+ case 0x1:
+ snp_cpuid_hv(&leaf_hv);
+
+ /* initial APIC ID */
+ leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
+ /* APIC enabled bit */
+ leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9));
+
+ /* OSXSAVE enabled bit */
+ if (native_read_cr4() & X86_CR4_OSXSAVE)
+ leaf->ecx |= BIT(27);
+ break;
+ case 0x7:
+ /* OSPKE enabled bit */
+ leaf->ecx &= ~BIT(4);
+ if (native_read_cr4() & X86_CR4_PKE)
+ leaf->ecx |= BIT(4);
+ break;
+ case 0xB:
+ leaf_hv.subfn = 0;
+ snp_cpuid_hv(&leaf_hv);
+
+ /* extended APIC ID */
+ leaf->edx = leaf_hv.edx;
+ break;
+ case 0xD: {
+ bool compacted = false;
+ u64 xcr0 = 1, xss = 0;
+ u32 xsave_size;
+
+ if (leaf->subfn != 0 && leaf->subfn != 1)
+ return 0;
+
+ if (native_read_cr4() & X86_CR4_OSXSAVE)
+ xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ if (leaf->subfn == 1) {
+ /* Get XSS value if XSAVES is enabled. */
+ if (leaf->eax & BIT(3)) {
+ unsigned long lo, hi;
+
+ asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
+ : "c" (MSR_IA32_XSS));
+ xss = (hi << 32) | lo;
+ }
+
+ /*
+ * The PPR and APM aren't clear on what size should be
+ * encoded in 0xD:0x1:EBX when compaction is not enabled
+ * by either XSAVEC (feature bit 1) or XSAVES (feature
+ * bit 3) since SNP-capable hardware has these feature
+ * bits fixed as 1. KVM sets it to 0 in this case, but
+ * to avoid this becoming an issue it's safer to simply
+ * treat this as unsupported for SNP guests.
+ */
+ if (!(leaf->eax & (BIT(1) | BIT(3))))
+ return -EINVAL;
+
+ compacted = true;
+ }
+
+ xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted);
+ if (!xsave_size)
+ return -EINVAL;
+
+ leaf->ebx = xsave_size;
+ }
+ break;
+ case 0x8000001E:
+ snp_cpuid_hv(&leaf_hv);
+
+ /* extended APIC ID */
+ leaf->eax = leaf_hv.eax;
+ /* compute ID */
+ leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0));
+ /* node ID */
+ leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0));
+ break;
+ default:
+ /* No fix-ups needed, use values as-is. */
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
+ * should be treated as fatal by caller.
+ */
+static int snp_cpuid(struct cpuid_leaf *leaf)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+
+ if (!cpuid_table->count)
+ return -EOPNOTSUPP;
+
+ if (!snp_cpuid_get_validated_func(leaf)) {
+ /*
+ * Some hypervisors will avoid keeping track of CPUID entries
+ * where all values are zero, since they can be handled the
+ * same as out-of-range values (all-zero). This is useful here
+ * as well as it allows virtually all guest configurations to
+ * work using a single SNP CPUID table.
+ *
+ * To allow for this, there is a need to distinguish between
+ * out-of-range entries and in-range zero entries, since the
+ * CPUID table entries are only a template that may need to be
+ * augmented with additional values for things like
+ * CPU-specific information during post-processing. So if it's
+ * not in the table, set the values to zero. Then, if they are
+ * within a valid CPUID range, proceed with post-processing
+ * using zeros as the initial values. Otherwise, skip
+ * post-processing and just return zeros immediately.
+ */
+ leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
+
+ /* Skip post-processing for out-of-range zero leafs. */
+ if (!(leaf->fn <= cpuid_std_range_max ||
+ (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
+ (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
+ return 0;
+ }
+
+ return snp_cpuid_postprocess(leaf);
+}
+
/*
* Boot VC Handler - This is the first VC handler during boot, there is no GHCB
* page yet, so it only supports the MSR based communication with the
@@ -157,40 +535,33 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr,
*/
void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
{
+ unsigned int subfn = lower_bits(regs->cx, 32);
unsigned int fn = lower_bits(regs->ax, 32);
- unsigned long val;
+ struct cpuid_leaf leaf;
+ int ret;
/* Only CPUID is supported via MSR protocol */
if (exit_code != SVM_EXIT_CPUID)
goto fail;
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
- goto fail;
- regs->ax = val >> 32;
+ leaf.fn = fn;
+ leaf.subfn = subfn;
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
- goto fail;
- regs->bx = val >> 32;
+ ret = snp_cpuid(&leaf);
+ if (!ret)
+ goto cpuid_done;
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
+ if (ret != -EOPNOTSUPP)
goto fail;
- regs->cx = val >> 32;
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
+ if (sev_cpuid_hv(&leaf))
goto fail;
- regs->dx = val >> 32;
+
+cpuid_done:
+ regs->ax = leaf.eax;
+ regs->bx = leaf.ebx;
+ regs->cx = leaf.ecx;
+ regs->dx = leaf.edx;
/*
* This is a VC handler and the #VC is only raised when SEV-ES is
@@ -221,7 +592,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
fail:
/* Terminate the guest */
- sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
}
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
@@ -481,12 +852,37 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ret;
}
+static int vc_handle_cpuid_snp(struct pt_regs *regs)
+{
+ struct cpuid_leaf leaf;
+ int ret;
+
+ leaf.fn = regs->ax;
+ leaf.subfn = regs->cx;
+ ret = snp_cpuid(&leaf);
+ if (!ret) {
+ regs->ax = leaf.eax;
+ regs->bx = leaf.ebx;
+ regs->cx = leaf.ecx;
+ regs->dx = leaf.edx;
+ }
+
+ return ret;
+}
+
static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
struct pt_regs *regs = ctxt->regs;
u32 cr4 = native_read_cr4();
enum es_result ret;
+ int snp_cpuid_ret;
+
+ snp_cpuid_ret = vc_handle_cpuid_snp(regs);
+ if (!snp_cpuid_ret)
+ return ES_OK;
+ if (snp_cpuid_ret != -EOPNOTSUPP)
+ return ES_VMM_ERROR;
ghcb_set_rax(ghcb, regs->ax);
ghcb_set_rcx(ghcb, regs->cx);
@@ -538,3 +934,67 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
return ES_OK;
}
+
+struct cc_setup_data {
+ struct setup_data header;
+ u32 cc_blob_address;
+};
+
+/*
+ * Search for a Confidential Computing blob passed in as a setup_data entry
+ * via the Linux Boot Protocol.
+ */
+static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
+{
+ struct cc_setup_data *sd = NULL;
+ struct setup_data *hdr;
+
+ hdr = (struct setup_data *)bp->hdr.setup_data;
+
+ while (hdr) {
+ if (hdr->type == SETUP_CC_BLOB) {
+ sd = (struct cc_setup_data *)hdr;
+ return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address;
+ }
+ hdr = (struct setup_data *)hdr->next;
+ }
+
+ return NULL;
+}
+
+/*
+ * Initialize the kernel's copy of the SNP CPUID table, and set up the
+ * pointer that will be used to access it.
+ *
+ * Maintaining a direct mapping of the SNP CPUID table used by firmware would
+ * be possible as an alternative, but the approach is brittle since the
+ * mapping needs to be updated in sync with all the changes to virtual memory
+ * layout and related mapping facilities throughout the boot process.
+ */
+static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+{
+ const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
+ int i;
+
+ if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
+
+ cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys;
+ if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX)
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
+
+ cpuid_table = snp_cpuid_get_table();
+ memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table));
+
+ /* Initialize CPUID ranges for range-checking. */
+ for (i = 0; i < cpuid_table->count; i++) {
+ const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ if (fn->eax_in == 0x0)
+ cpuid_std_range_max = fn->eax;
+ else if (fn->eax_in == 0x40000000)
+ cpuid_hyp_range_max = fn->eax;
+ else if (fn->eax_in == 0x80000000)
+ cpuid_ext_range_max = fn->eax;
+ }
+}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index e6d316a01fdd..c05f0124c410 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -18,6 +18,10 @@
#include <linux/memblock.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/efi.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
@@ -31,9 +35,28 @@
#include <asm/svm.h>
#include <asm/smp.h>
#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid.h>
+#include <asm/cmdline.h>
#define DR7_RESET_VALUE 0x400
+/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
+#define AP_INIT_CS_LIMIT 0xffff
+#define AP_INIT_DS_LIMIT 0xffff
+#define AP_INIT_LDTR_LIMIT 0xffff
+#define AP_INIT_GDTR_LIMIT 0xffff
+#define AP_INIT_IDTR_LIMIT 0xffff
+#define AP_INIT_TR_LIMIT 0xffff
+#define AP_INIT_RFLAGS_DEFAULT 0x2
+#define AP_INIT_DR6_DEFAULT 0xffff0ff0
+#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define AP_INIT_XCR0_DEFAULT 0x1
+#define AP_INIT_X87_FTW_DEFAULT 0x5555
+#define AP_INIT_X87_FCW_DEFAULT 0x0040
+#define AP_INIT_CR0_DEFAULT 0x60000010
+#define AP_INIT_MXCSR_DEFAULT 0x1f80
+
/* For early boot hypervisor communication in SEV-ES enabled guests */
static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
@@ -41,7 +64,10 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
* Needs to be in the .data section because we need it NULL before bss is
* cleared
*/
-static struct ghcb __initdata *boot_ghcb;
+static struct ghcb *boot_ghcb __section(".data");
+
+/* Bitmap of SEV features supported by the hypervisor */
+static u64 sev_hv_features __ro_after_init;
/* #VC handler runtime per-CPU data */
struct sev_es_runtime_data {
@@ -87,6 +113,15 @@ struct ghcb_state {
static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
+static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
+
+struct sev_config {
+ __u64 debug : 1,
+ __reserved : 63;
+};
+
+static struct sev_config sev_cfg __read_mostly;
+
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
@@ -523,13 +558,68 @@ void noinstr __sev_es_nmi_complete(void)
__sev_put_ghcb(&state);
}
-static u64 get_jump_table_addr(void)
+static u64 __init get_secrets_page(void)
+{
+ u64 pa_data = boot_params.cc_blob_address;
+ struct cc_blob_sev_info info;
+ void *map;
+
+ /*
+ * The CC blob contains the address of the secrets page, check if the
+ * blob is present.
+ */
+ if (!pa_data)
+ return 0;
+
+ map = early_memremap(pa_data, sizeof(info));
+ if (!map) {
+ pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n");
+ return 0;
+ }
+ memcpy(&info, map, sizeof(info));
+ early_memunmap(map, sizeof(info));
+
+ /* smoke-test the secrets page passed */
+ if (!info.secrets_phys || info.secrets_len != PAGE_SIZE)
+ return 0;
+
+ return info.secrets_phys;
+}
+
+static u64 __init get_snp_jump_table_addr(void)
+{
+ struct snp_secrets_page_layout *layout;
+ void __iomem *mem;
+ u64 pa, addr;
+
+ pa = get_secrets_page();
+ if (!pa)
+ return 0;
+
+ mem = ioremap_encrypted(pa, PAGE_SIZE);
+ if (!mem) {
+ pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
+ return 0;
+ }
+
+ layout = (__force struct snp_secrets_page_layout *)mem;
+
+ addr = layout->os_area.ap_jump_table_pa;
+ iounmap(mem);
+
+ return addr;
+}
+
+static u64 __init get_jump_table_addr(void)
{
struct ghcb_state state;
unsigned long flags;
struct ghcb *ghcb;
u64 ret = 0;
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return get_snp_jump_table_addr();
+
local_irq_save(flags);
ghcb = __sev_get_ghcb(&state);
@@ -553,7 +643,496 @@ static u64 get_jump_table_addr(void)
return ret;
}
-int sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
+static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate)
+{
+ unsigned long vaddr_end;
+ int rc;
+
+ vaddr = vaddr & PAGE_MASK;
+ vaddr_end = vaddr + (npages << PAGE_SHIFT);
+
+ while (vaddr < vaddr_end) {
+ rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
+ if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
+
+ vaddr = vaddr + PAGE_SIZE;
+ }
+}
+
+static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op)
+{
+ unsigned long paddr_end;
+ u64 val;
+
+ paddr = paddr & PAGE_MASK;
+ paddr_end = paddr + (npages << PAGE_SHIFT);
+
+ while (paddr < paddr_end) {
+ /*
+ * Use the MSR protocol because this function can be called before
+ * the GHCB is established.
+ */
+ sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
+ VMGEXIT();
+
+ val = sev_es_rd_ghcb_msr();
+
+ if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
+ "Wrong PSC response code: 0x%x\n",
+ (unsigned int)GHCB_RESP_CODE(val)))
+ goto e_term;
+
+ if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
+ "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
+ op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
+ paddr, GHCB_MSR_PSC_RESP_VAL(val)))
+ goto e_term;
+
+ paddr = paddr + PAGE_SIZE;
+ }
+
+ return;
+
+e_term:
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+}
+
+void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned int npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ /*
+ * Ask the hypervisor to mark the memory pages as private in the RMP
+ * table.
+ */
+ early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE);
+
+ /* Validate the memory pages after they've been added in the RMP table. */
+ pvalidate_pages(vaddr, npages, true);
+}
+
+void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned int npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ /* Invalidate the memory pages before they are marked shared in the RMP table. */
+ pvalidate_pages(vaddr, npages, false);
+
+ /* Ask hypervisor to mark the memory pages shared in the RMP table. */
+ early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED);
+}
+
+void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
+{
+ unsigned long vaddr, npages;
+
+ vaddr = (unsigned long)__va(paddr);
+ npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+
+ if (op == SNP_PAGE_STATE_PRIVATE)
+ early_snp_set_memory_private(vaddr, paddr, npages);
+ else if (op == SNP_PAGE_STATE_SHARED)
+ early_snp_set_memory_shared(vaddr, paddr, npages);
+ else
+ WARN(1, "invalid memory op %d\n", op);
+}
+
+static int vmgexit_psc(struct snp_psc_desc *desc)
+{
+ int cur_entry, end_entry, ret = 0;
+ struct snp_psc_desc *data;
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ unsigned long flags;
+ struct ghcb *ghcb;
+
+ /*
+ * __sev_get_ghcb() needs to run with IRQs disabled because it is using
+ * a per-CPU GHCB.
+ */
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+ if (!ghcb) {
+ ret = 1;
+ goto out_unlock;
+ }
+
+ /* Copy the input desc into GHCB shared buffer */
+ data = (struct snp_psc_desc *)ghcb->shared_buffer;
+ memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
+
+ /*
+ * As per the GHCB specification, the hypervisor can resume the guest
+ * before processing all the entries. Check whether all the entries
+ * are processed. If not, then keep retrying. Note, the hypervisor
+ * will update the data memory directly to indicate the status, so
+ * reference the data->hdr everywhere.
+ *
+ * The strategy here is to wait for the hypervisor to change the page
+ * state in the RMP table before guest accesses the memory pages. If the
+ * page state change was not successful, then later memory access will
+ * result in a crash.
+ */
+ cur_entry = data->hdr.cur_entry;
+ end_entry = data->hdr.end_entry;
+
+ while (data->hdr.cur_entry <= data->hdr.end_entry) {
+ ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
+
+ /* This will advance the shared buffer data points to. */
+ ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
+
+ /*
+ * Page State Change VMGEXIT can pass error code through
+ * exit_info_2.
+ */
+ if (WARN(ret || ghcb->save.sw_exit_info_2,
+ "SNP: PSC failed ret=%d exit_info_2=%llx\n",
+ ret, ghcb->save.sw_exit_info_2)) {
+ ret = 1;
+ goto out;
+ }
+
+ /* Verify that reserved bit is not set */
+ if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Sanity check that entry processing is not going backwards.
+ * This will happen only if hypervisor is tricking us.
+ */
+ if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
+"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
+ end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
+ ret = 1;
+ goto out;
+ }
+ }
+
+out:
+ __sev_put_ghcb(&state);
+
+out_unlock:
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
+ unsigned long vaddr_end, int op)
+{
+ struct psc_hdr *hdr;
+ struct psc_entry *e;
+ unsigned long pfn;
+ int i;
+
+ hdr = &data->hdr;
+ e = data->entries;
+
+ memset(data, 0, sizeof(*data));
+ i = 0;
+
+ while (vaddr < vaddr_end) {
+ if (is_vmalloc_addr((void *)vaddr))
+ pfn = vmalloc_to_pfn((void *)vaddr);
+ else
+ pfn = __pa(vaddr) >> PAGE_SHIFT;
+
+ e->gfn = pfn;
+ e->operation = op;
+ hdr->end_entry = i;
+
+ /*
+ * Current SNP implementation doesn't keep track of the RMP page
+ * size so use 4K for simplicity.
+ */
+ e->pagesize = RMP_PG_SIZE_4K;
+
+ vaddr = vaddr + PAGE_SIZE;
+ e++;
+ i++;
+ }
+
+ if (vmgexit_psc(data))
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
+}
+
+static void set_pages_state(unsigned long vaddr, unsigned int npages, int op)
+{
+ unsigned long vaddr_end, next_vaddr;
+ struct snp_psc_desc *desc;
+
+ desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT);
+ if (!desc)
+ panic("SNP: failed to allocate memory for PSC descriptor\n");
+
+ vaddr = vaddr & PAGE_MASK;
+ vaddr_end = vaddr + (npages << PAGE_SHIFT);
+
+ while (vaddr < vaddr_end) {
+ /* Calculate the last vaddr that fits in one struct snp_psc_desc. */
+ next_vaddr = min_t(unsigned long, vaddr_end,
+ (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr);
+
+ __set_pages_state(desc, vaddr, next_vaddr, op);
+
+ vaddr = next_vaddr;
+ }
+
+ kfree(desc);
+}
+
+void snp_set_memory_shared(unsigned long vaddr, unsigned int npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ pvalidate_pages(vaddr, npages, false);
+
+ set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
+}
+
+void snp_set_memory_private(unsigned long vaddr, unsigned int npages)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
+
+ pvalidate_pages(vaddr, npages, true);
+}
+
+static int snp_set_vmsa(void *va, bool vmsa)
+{
+ u64 attrs;
+
+ /*
+ * Running at VMPL0 allows the kernel to change the VMSA bit for a page
+ * using the RMPADJUST instruction. However, for the instruction to
+ * succeed it must target the permissions of a lesser privileged
+ * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
+ * instruction in the AMD64 APM Volume 3).
+ */
+ attrs = 1;
+ if (vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+}
+
+#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
+#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
+#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)
+
+#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
+#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
+
+static void *snp_alloc_vmsa_page(void)
+{
+ struct page *p;
+
+ /*
+ * Allocate VMSA page to work around the SNP erratum where the CPU will
+ * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
+ * collides with the RMP entry of VMSA page. The recommended workaround
+ * is to not use a large page.
+ *
+ * Allocate an 8k page which is also 8k-aligned.
+ */
+ p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ if (!p)
+ return NULL;
+
+ split_page(p, 1);
+
+ /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */
+ __free_page(p);
+
+ return page_address(p + 1);
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
+{
+ int err;
+
+ err = snp_set_vmsa(vmsa, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+}
+
+static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
+{
+ struct sev_es_save_area *cur_vmsa, *vmsa;
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ u8 sipi_vector;
+ int cpu, ret;
+ u64 cr4;
+
+ /*
+ * The hypervisor SNP feature support check has happened earlier, just check
+ * the AP_CREATION one here.
+ */
+ if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
+ return -EOPNOTSUPP;
+
+ /*
+ * Verify the desired start IP against the known trampoline start IP
+ * to catch any future new trampolines that may be introduced that
+ * would require a new protected guest entry point.
+ */
+ if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
+ "Unsupported SNP start_ip: %lx\n", start_ip))
+ return -EINVAL;
+
+ /* Override start_ip with known protected guest start IP */
+ start_ip = real_mode_header->sev_es_trampoline_start;
+
+ /* Find the logical CPU for the APIC ID */
+ for_each_present_cpu(cpu) {
+ if (arch_match_cpu_phys_id(cpu, apic_id))
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ return -EINVAL;
+
+ cur_vmsa = per_cpu(sev_vmsa, cpu);
+
+ /*
+ * A new VMSA is created each time because there is no guarantee that
+ * the current VMSA is the kernels or that the vCPU is not running. If
+ * an attempt was done to use the current VMSA with a running vCPU, a
+ * #VMEXIT of that vCPU would wipe out all of the settings being done
+ * here.
+ */
+ vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
+ if (!vmsa)
+ return -ENOMEM;
+
+ /* CR4 should maintain the MCE value */
+ cr4 = native_read_cr4() & X86_CR4_MCE;
+
+ /* Set the CS value based on the start_ip converted to a SIPI vector */
+ sipi_vector = (start_ip >> 12);
+ vmsa->cs.base = sipi_vector << 12;
+ vmsa->cs.limit = AP_INIT_CS_LIMIT;
+ vmsa->cs.attrib = INIT_CS_ATTRIBS;
+ vmsa->cs.selector = sipi_vector << 8;
+
+ /* Set the RIP value based on start_ip */
+ vmsa->rip = start_ip & 0xfff;
+
+ /* Set AP INIT defaults as documented in the APM */
+ vmsa->ds.limit = AP_INIT_DS_LIMIT;
+ vmsa->ds.attrib = INIT_DS_ATTRIBS;
+ vmsa->es = vmsa->ds;
+ vmsa->fs = vmsa->ds;
+ vmsa->gs = vmsa->ds;
+ vmsa->ss = vmsa->ds;
+
+ vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT;
+ vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT;
+ vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS;
+ vmsa->idtr.limit = AP_INIT_IDTR_LIMIT;
+ vmsa->tr.limit = AP_INIT_TR_LIMIT;
+ vmsa->tr.attrib = INIT_TR_ATTRIBS;
+
+ vmsa->cr4 = cr4;
+ vmsa->cr0 = AP_INIT_CR0_DEFAULT;
+ vmsa->dr7 = DR7_RESET_VALUE;
+ vmsa->dr6 = AP_INIT_DR6_DEFAULT;
+ vmsa->rflags = AP_INIT_RFLAGS_DEFAULT;
+ vmsa->g_pat = AP_INIT_GPAT_DEFAULT;
+ vmsa->xcr0 = AP_INIT_XCR0_DEFAULT;
+ vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT;
+ vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT;
+ vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT;
+
+ /* SVME must be set. */
+ vmsa->efer = EFER_SVME;
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = 0;
+ vmsa->sev_features = sev_status >> 2;
+
+ /* Switch the page over to a VMSA page now that it is initialized */
+ ret = snp_set_vmsa(vmsa, true);
+ if (ret) {
+ pr_err("set VMSA page failed (%u)\n", ret);
+ free_page((unsigned long)vmsa);
+
+ return -EINVAL;
+ }
+
+ /* Issue VMGEXIT AP Creation NAE event */
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_rax(ghcb, vmsa->sev_features);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
+ ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE);
+ ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
+ lower_32_bits(ghcb->save.sw_exit_info_1)) {
+ pr_err("SNP AP Creation error\n");
+ ret = -EINVAL;
+ }
+
+ __sev_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ /* Perform cleanup if there was an error */
+ if (ret) {
+ snp_cleanup_vmsa(vmsa);
+ vmsa = NULL;
+ }
+
+ /* Free up any previous VMSA page */
+ if (cur_vmsa)
+ snp_cleanup_vmsa(cur_vmsa);
+
+ /* Record the current VMSA page */
+ per_cpu(sev_vmsa, cpu) = vmsa;
+
+ return ret;
+}
+
+void snp_set_wakeup_secondary_cpu(void)
+{
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return;
+
+ /*
+ * Always set this override if SNP is enabled. This makes it the
+ * required method to start APs under SNP. If the hypervisor does
+ * not support AP creation, then no APs will be started.
+ */
+ apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit;
+}
+
+int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
{
u16 startup_cs, startup_ip;
phys_addr_t jump_table_pa;
@@ -644,15 +1223,39 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
return ret;
}
-/*
- * This function runs on the first #VC exception after the kernel
- * switched to virtual addresses.
- */
-static bool __init sev_es_setup_ghcb(void)
+static void snp_register_per_cpu_ghcb(void)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ snp_register_ghcb_early(__pa(ghcb));
+}
+
+void setup_ghcb(void)
{
+ if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ return;
+
/* First make sure the hypervisor talks a supported protocol. */
if (!sev_es_negotiate_protocol())
- return false;
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+
+ /*
+ * Check whether the runtime #VC exception handler is active. It uses
+ * the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
+ *
+ * If SNP is active, register the per-CPU GHCB page so that the runtime
+ * exception handler can use it.
+ */
+ if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ snp_register_per_cpu_ghcb();
+
+ return;
+ }
/*
* Clear the boot_ghcb. The first exception comes in before the bss
@@ -663,7 +1266,9 @@ static bool __init sev_es_setup_ghcb(void)
/* Alright - Make the boot-ghcb public */
boot_ghcb = &boot_ghcb_page;
- return true;
+ /* SNP guest requires that GHCB GPA must be registered. */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ snp_register_ghcb_early(__pa(&boot_ghcb_page));
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -766,6 +1371,17 @@ void __init sev_es_init_vc_handling(void)
if (!sev_es_check_cpu_features())
panic("SEV-ES CPU Features missing");
+ /*
+ * SNP is supported in v2 of the GHCB spec which mandates support for HV
+ * features.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ sev_hv_features = get_hv_features();
+
+ if (!(sev_hv_features & GHCB_HV_FT_SNP))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+ }
+
/* Enable SEV-ES special handling */
static_branch_enable(&sev_es_enable_key);
@@ -1337,7 +1953,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
show_regs(regs);
/* Ask hypervisor to sev_es_terminate */
- sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
/* If that fails and we get here - just panic */
panic("Returned from Terminate-Request to Hypervisor\n");
@@ -1383,10 +1999,6 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
struct es_em_ctxt ctxt;
enum es_result result;
- /* Do initial setup or terminate the guest */
- if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
- sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
-
vc_ghcb_invalidate(boot_ghcb);
result = vc_init_em_ctxt(&ctxt, regs, exit_code);
@@ -1425,6 +2037,215 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
fail:
show_regs(regs);
- while (true)
- halt();
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
+}
+
+/*
+ * Initial set up of SNP relies on information provided by the
+ * Confidential Computing blob, which can be passed to the kernel
+ * in the following ways, depending on how it is booted:
+ *
+ * - when booted via the boot/decompress kernel:
+ * - via boot_params
+ *
+ * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
+ * - via a setup_data entry, as defined by the Linux Boot Protocol
+ *
+ * Scan for the blob in that order.
+ */
+static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ /* Boot kernel would have passed the CC blob via boot_params. */
+ if (bp->cc_blob_address) {
+ cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
+ goto found_cc_info;
+ }
+
+ /*
+ * If kernel was booted directly, without the use of the
+ * boot/decompression kernel, the CC blob may have been passed via
+ * setup_data instead.
+ */
+ cc_info = find_cc_blob_setup_data(bp);
+ if (!cc_info)
+ return NULL;
+
+found_cc_info:
+ if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
+ snp_abort();
+
+ return cc_info;
+}
+
+bool __init snp_init(struct boot_params *bp)
+{
+ struct cc_blob_sev_info *cc_info;
+
+ if (!bp)
+ return false;
+
+ cc_info = find_cc_blob(bp);
+ if (!cc_info)
+ return false;
+
+ setup_cpuid_table(cc_info);
+
+ /*
+ * The CC blob will be used later to access the secrets page. Cache
+ * it here like the boot kernel does.
+ */
+ bp->cc_blob_address = (u32)(unsigned long)cc_info;
+
+ return true;
+}
+
+void __init snp_abort(void)
+{
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+}
+
+static void dump_cpuid_table(void)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+ int i = 0;
+
+ pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
+ cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
+
+ for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) {
+ const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
+
+ pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
+ i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
+ fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
+ }
+}
+
+/*
+ * It is useful from an auditing/testing perspective to provide an easy way
+ * for the guest owner to know that the CPUID table has been initialized as
+ * expected, but that initialization happens too early in boot to print any
+ * sort of indicator, and there's not really any other good place to do it,
+ * so do it here.
+ */
+static int __init report_cpuid_table(void)
+{
+ const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
+
+ if (!cpuid_table->count)
+ return 0;
+
+ pr_info("Using SNP CPUID table, %d entries present.\n",
+ cpuid_table->count);
+
+ if (sev_cfg.debug)
+ dump_cpuid_table();
+
+ return 0;
+}
+arch_initcall(report_cpuid_table);
+
+static int __init init_sev_config(char *str)
+{
+ char *s;
+
+ while ((s = strsep(&str, ","))) {
+ if (!strcmp(s, "debug")) {
+ sev_cfg.debug = true;
+ continue;
+ }
+
+ pr_info("SEV command-line option '%s' was not recognized\n", s);
+ }
+
+ return 1;
+}
+__setup("sev=", init_sev_config);
+
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err)
+{
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ int ret;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return -ENODEV;
+
+ if (!fw_err)
+ return -EINVAL;
+
+ /*
+ * __sev_get_ghcb() needs to run with IRQs disabled because it is using
+ * a per-CPU GHCB.
+ */
+ local_irq_save(flags);
+
+ ghcb = __sev_get_ghcb(&state);
+ if (!ghcb) {
+ ret = -EIO;
+ goto e_restore_irq;
+ }
+
+ vc_ghcb_invalidate(ghcb);
+
+ if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
+ ghcb_set_rax(ghcb, input->data_gpa);
+ ghcb_set_rbx(ghcb, input->data_npages);
+ }
+
+ ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
+ if (ret)
+ goto e_put;
+
+ if (ghcb->save.sw_exit_info_2) {
+ /* Number of expected pages are returned in RBX */
+ if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST &&
+ ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN)
+ input->data_npages = ghcb_get_rbx(ghcb);
+
+ *fw_err = ghcb->save.sw_exit_info_2;
+
+ ret = -EIO;
+ }
+
+e_put:
+ __sev_put_ghcb(&state);
+e_restore_irq:
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(snp_issue_guest_request);
+
+static struct platform_device sev_guest_device = {
+ .name = "sev-guest",
+ .id = -1,
+};
+
+static int __init snp_init_platform_device(void)
+{
+ struct sev_guest_platform_data data;
+ u64 gpa;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ return -ENODEV;
+
+ gpa = get_secrets_page();
+ if (!gpa)
+ return -ENODEV;
+
+ data.secrets_gpa = gpa;
+ if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
+ return -ENODEV;
+
+ if (platform_device_register(&sev_guest_device))
+ return -ENODEV;
+
+ pr_info("SNP guest platform device initialized.\n");
+ return 0;
}
+device_initcall(snp_init_platform_device);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e439eb14325f..9c7265b524c7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -93,7 +93,7 @@ static bool restore_sigcontext(struct pt_regs *regs,
return false;
#ifdef CONFIG_X86_32
- set_user_gs(regs, sc.gs);
+ loadsegment(gs, sc.gs);
regs->fs = sc.fs;
regs->es = sc.es;
regs->ds = sc.ds;
@@ -146,8 +146,10 @@ __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
struct pt_regs *regs, unsigned long mask)
{
#ifdef CONFIG_X86_32
- unsafe_put_user(get_user_gs(regs),
- (unsigned int __user *)&sc->gs, Efault);
+ unsigned int gs;
+ savesegment(gs, gs);
+
+ unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault);
unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index b52407c56000..879ef8c72f5c 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -149,8 +149,10 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18);
BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18);
CHECK_CSI_OFFSET(_sigpoll);
CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2ef14772dc04..5e7f9532a10d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,7 +56,6 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
-#include <linux/syscore_ops.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -82,6 +81,7 @@
#include <asm/spec-ctrl.h>
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
+#include <asm/sev.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -187,7 +187,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
- init_freq_invariance(true, false);
+ ap_init_aperfmperf();
/*
* Get our bogomips.
@@ -1082,6 +1082,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
unsigned long boot_error = 0;
unsigned long timeout;
+#ifdef CONFIG_X86_64
+ /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */
+ if (apic->wakeup_secondary_cpu_64)
+ start_ip = real_mode_header->trampoline_start64;
+#endif
idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
@@ -1123,11 +1128,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
/*
* Wake up a CPU in difference cases:
- * - Use the method in the APIC driver if it's defined
+ * - Use a method from the APIC driver if one defined, with wakeup
+ * straight to 64-bit mode preferred over wakeup to RM.
* Otherwise,
* - Use an INIT boot APIC message for APs or NMI for BSP.
*/
- if (apic->wakeup_secondary_cpu)
+ if (apic->wakeup_secondary_cpu_64)
+ boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip);
+ else if (apic->wakeup_secondary_cpu)
boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
else
boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
@@ -1397,7 +1405,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
smp_prepare_cpus_common();
- init_freq_invariance(false, false);
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1430,6 +1437,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
smp_quirk_init_udelay();
speculative_store_bypass_ht_init();
+
+ snp_set_wakeup_secondary_cpu();
}
void arch_thaw_secondary_cpus_begin(void)
@@ -1847,357 +1856,3 @@ void native_play_dead(void)
}
#endif
-
-#ifdef CONFIG_X86_64
-/*
- * APERF/MPERF frequency ratio computation.
- *
- * The scheduler wants to do frequency invariant accounting and needs a <1
- * ratio to account for the 'current' frequency, corresponding to
- * freq_curr / freq_max.
- *
- * Since the frequency freq_curr on x86 is controlled by micro-controller and
- * our P-state setting is little more than a request/hint, we need to observe
- * the effective frequency 'BusyMHz', i.e. the average frequency over a time
- * interval after discarding idle time. This is given by:
- *
- * BusyMHz = delta_APERF / delta_MPERF * freq_base
- *
- * where freq_base is the max non-turbo P-state.
- *
- * The freq_max term has to be set to a somewhat arbitrary value, because we
- * can't know which turbo states will be available at a given point in time:
- * it all depends on the thermal headroom of the entire package. We set it to
- * the turbo level with 4 cores active.
- *
- * Benchmarks show that's a good compromise between the 1C turbo ratio
- * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
- * which would ignore the entire turbo range (a conspicuous part, making
- * freq_curr/freq_max always maxed out).
- *
- * An exception to the heuristic above is the Atom uarch, where we choose the
- * highest turbo level for freq_max since Atom's are generally oriented towards
- * power efficiency.
- *
- * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
- * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
- */
-
-DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
-
-static DEFINE_PER_CPU(u64, arch_prev_aperf);
-static DEFINE_PER_CPU(u64, arch_prev_mperf);
-static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
-static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
-
-void arch_set_max_freq_ratio(bool turbo_disabled)
-{
- arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
- arch_turbo_freq_ratio;
-}
-EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
-
-static bool turbo_disabled(void)
-{
- u64 misc_en;
- int err;
-
- err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
- if (err)
- return false;
-
- return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
-}
-
-static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
-{
- int err;
-
- err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
- if (err)
- return false;
-
- err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
- if (err)
- return false;
-
- *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
- *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
-
- return true;
-}
-
-#define X86_MATCH(model) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
-
-static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
- X86_MATCH(XEON_PHI_KNL),
- X86_MATCH(XEON_PHI_KNM),
- {}
-};
-
-static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
- X86_MATCH(SKYLAKE_X),
- {}
-};
-
-static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
- X86_MATCH(ATOM_GOLDMONT),
- X86_MATCH(ATOM_GOLDMONT_D),
- X86_MATCH(ATOM_GOLDMONT_PLUS),
- {}
-};
-
-static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
- int num_delta_fratio)
-{
- int fratio, delta_fratio, found;
- int err, i;
- u64 msr;
-
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
- if (err)
- return false;
-
- *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
-
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
- if (err)
- return false;
-
- fratio = (msr >> 8) & 0xFF;
- i = 16;
- found = 0;
- do {
- if (found >= num_delta_fratio) {
- *turbo_freq = fratio;
- return true;
- }
-
- delta_fratio = (msr >> (i + 5)) & 0x7;
-
- if (delta_fratio) {
- found += 1;
- fratio -= delta_fratio;
- }
-
- i += 8;
- } while (i < 64);
-
- return true;
-}
-
-static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
-{
- u64 ratios, counts;
- u32 group_size;
- int err, i;
-
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
- if (err)
- return false;
-
- *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
-
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
- if (err)
- return false;
-
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
- if (err)
- return false;
-
- for (i = 0; i < 64; i += 8) {
- group_size = (counts >> i) & 0xFF;
- if (group_size >= size) {
- *turbo_freq = (ratios >> i) & 0xFF;
- return true;
- }
- }
-
- return false;
-}
-
-static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
-{
- u64 msr;
- int err;
-
- err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
- if (err)
- return false;
-
- err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
- if (err)
- return false;
-
- *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
- *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
-
- /* The CPU may have less than 4 cores */
- if (!*turbo_freq)
- *turbo_freq = msr & 0xFF; /* 1C turbo */
-
- return true;
-}
-
-static bool intel_set_max_freq_ratio(void)
-{
- u64 base_freq, turbo_freq;
- u64 turbo_ratio;
-
- if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
- goto out;
-
- if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
- skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
- goto out;
-
- if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
- knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
- goto out;
-
- if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
- skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
- goto out;
-
- if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
- goto out;
-
- return false;
-
-out:
- /*
- * Some hypervisors advertise X86_FEATURE_APERFMPERF
- * but then fill all MSR's with zeroes.
- * Some CPUs have turbo boost but don't declare any turbo ratio
- * in MSR_TURBO_RATIO_LIMIT.
- */
- if (!base_freq || !turbo_freq) {
- pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
- return false;
- }
-
- turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
- if (!turbo_ratio) {
- pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
- return false;
- }
-
- arch_turbo_freq_ratio = turbo_ratio;
- arch_set_max_freq_ratio(turbo_disabled());
-
- return true;
-}
-
-static void init_counter_refs(void)
-{
- u64 aperf, mperf;
-
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
-
- this_cpu_write(arch_prev_aperf, aperf);
- this_cpu_write(arch_prev_mperf, mperf);
-}
-
-#ifdef CONFIG_PM_SLEEP
-static struct syscore_ops freq_invariance_syscore_ops = {
- .resume = init_counter_refs,
-};
-
-static void register_freq_invariance_syscore_ops(void)
-{
- /* Bail out if registered already. */
- if (freq_invariance_syscore_ops.node.prev)
- return;
-
- register_syscore_ops(&freq_invariance_syscore_ops);
-}
-#else
-static inline void register_freq_invariance_syscore_ops(void) {}
-#endif
-
-void init_freq_invariance(bool secondary, bool cppc_ready)
-{
- bool ret = false;
-
- if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
- return;
-
- if (secondary) {
- if (static_branch_likely(&arch_scale_freq_key)) {
- init_counter_refs();
- }
- return;
- }
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- ret = intel_set_max_freq_ratio();
- else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (!cppc_ready) {
- return;
- }
- ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio);
- }
-
- if (ret) {
- init_counter_refs();
- static_branch_enable(&arch_scale_freq_key);
- register_freq_invariance_syscore_ops();
- pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
- } else {
- pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
- }
-}
-
-static void disable_freq_invariance_workfn(struct work_struct *work)
-{
- static_branch_disable(&arch_scale_freq_key);
-}
-
-static DECLARE_WORK(disable_freq_invariance_work,
- disable_freq_invariance_workfn);
-
-DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
-
-void arch_scale_freq_tick(void)
-{
- u64 freq_scale;
- u64 aperf, mperf;
- u64 acnt, mcnt;
-
- if (!arch_scale_freq_invariant())
- return;
-
- rdmsrl(MSR_IA32_APERF, aperf);
- rdmsrl(MSR_IA32_MPERF, mperf);
-
- acnt = aperf - this_cpu_read(arch_prev_aperf);
- mcnt = mperf - this_cpu_read(arch_prev_mperf);
-
- this_cpu_write(arch_prev_aperf, aperf);
- this_cpu_write(arch_prev_mperf, mperf);
-
- if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
- goto error;
-
- if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
- goto error;
-
- freq_scale = div64_u64(acnt, mcnt);
- if (!freq_scale)
- goto error;
-
- if (freq_scale > SCHED_CAPACITY_SCALE)
- freq_scale = SCHED_CAPACITY_SCALE;
-
- this_cpu_write(arch_freq_scale, freq_scale);
- return;
-
-error:
- pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
- schedule_work(&disable_freq_invariance_work);
-}
-#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 531fb4cbb63f..aa72cefdd5be 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -12,10 +12,9 @@ enum insn_type {
};
/*
- * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
- * The REX.W cancels the effect of any data16.
+ * cs cs cs xorl %eax, %eax - a single 5 byte instruction that clears %[er]ax
*/
-static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 660b78827638..8cc653ffdccd 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -68,9 +68,6 @@ static int __init control_va_addr_alignment(char *str)
if (*str == 0)
return 1;
- if (*str == '=')
- str++;
-
if (!strcmp(str, "32"))
va_align.flags = ALIGN_VA_32;
else if (!strcmp(str, "64"))
@@ -80,11 +77,11 @@ static int __init control_va_addr_alignment(char *str)
else if (!strcmp(str, "on"))
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
else
- return 0;
+ pr_warn("invalid option value: 'align_va_addr=%s'\n", str);
return 1;
}
-__setup("align_va_addr", control_va_addr_alignment);
+__setup("align_va_addr=", control_va_addr_alignment);
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1563fb995005..d62b2cb85cea 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,6 +62,7 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
+#include <asm/tdx.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -686,13 +687,40 @@ static bool try_fixup_enqcmd_gp(void)
#endif
}
+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
+{
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return true;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
+
+ return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP;
+}
+
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
+{
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
- struct task_struct *tsk;
unsigned long gp_addr;
- int ret;
if (user_mode(regs) && try_fixup_enqcmd_gp())
return;
@@ -711,40 +739,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
return;
}
- tsk = current;
-
if (user_mode(regs)) {
if (fixup_iopl_exception(regs))
goto exit;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
goto exit;
- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
- force_sig(SIGSEGV);
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
goto exit;
}
- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- goto exit;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
- /*
- * To be potentially processing a kprobe fault and to trust the result
- * from kprobe_running(), we have to be non-preemptible.
- */
- if (!preemptible() &&
- kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- goto exit;
-
- ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
- if (ret == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc))
goto exit;
if (error_code)
@@ -892,14 +898,10 @@ sync:
}
#endif
-struct bad_iret_stack {
- void *error_entry_ret;
- struct pt_regs regs;
-};
-
-asmlinkage __visible noinstr
-struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
{
+ struct pt_regs tmp, *new_stack;
+
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
@@ -908,19 +910,18 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
- struct bad_iret_stack tmp, *new_stack =
- (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the temporary storage. */
- __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
+ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8);
/* Copy the remainder of the stack from the current stack. */
- __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
+ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip));
/* Update the entry stack */
__memcpy(new_stack, &tmp, sizeof(tmp));
- BUG_ON(!user_mode(&new_stack->regs));
+ BUG_ON(!user_mode(new_stack));
return new_stack;
}
#endif
@@ -1343,6 +1344,91 @@ DEFINE_IDTENTRY(exc_device_not_available)
}
}
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+#define VE_FAULT_STR "VE fault"
+
+static void ve_raise_fault(struct pt_regs *regs, long error_code)
+{
+ if (user_mode(regs)) {
+ gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
+ return;
+ }
+
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR))
+ return;
+
+ die_addr(VE_FAULT_STR, regs, error_code, 0);
+}
+
+/*
+ * Virtualization Exceptions (#VE) are delivered to TDX guests due to
+ * specific guest actions which may happen in either user space or the
+ * kernel:
+ *
+ * * Specific instructions (WBINVD, for example)
+ * * Specific MSR accesses
+ * * Specific CPUID leaf accesses
+ * * Access to specific guest physical addresses
+ *
+ * In the settings that Linux will run in, virtualization exceptions are
+ * never generated on accesses to normal, TD-private memory that has been
+ * accepted (by BIOS or with tdx_enc_status_changed()).
+ *
+ * Syscall entry code has a critical window where the kernel stack is not
+ * yet set up. Any exception in this window leads to hard to debug issues
+ * and can be exploited for privilege escalation. Exceptions in the NMI
+ * entry code also cause issues. Returning from the exception handler with
+ * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack.
+ *
+ * For these reasons, the kernel avoids #VEs during the syscall gap and
+ * the NMI entry code. Entry code paths do not access TD-shared memory,
+ * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves
+ * that might generate #VE. VMM can remove memory from TD at any point,
+ * but access to unaccepted (or missing) private memory leads to VM
+ * termination, not to #VE.
+ *
+ * Similarly to page faults and breakpoints, #VEs are allowed in NMI
+ * handlers once the kernel is ready to deal with nested NMIs.
+ *
+ * During #VE delivery, all interrupts, including NMIs, are blocked until
+ * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads
+ * the VE info.
+ *
+ * If a guest kernel action which would normally cause a #VE occurs in
+ * the interrupt-disabled region before TDGETVEINFO, a #DF (fault
+ * exception) is delivered to the guest which will result in an oops.
+ *
+ * The entry code has been audited carefully for following these expectations.
+ * Changes in the entry code have to be audited for correctness vs. this
+ * aspect. Similarly to #PF, #VE in these places will expose kernel to
+ * privilege escalation or may lead to random crashes.
+ */
+DEFINE_IDTENTRY(exc_virtualization_exception)
+{
+ struct ve_info ve;
+
+ /*
+ * NMIs/Machine-checks/Interrupts will be in a disabled state
+ * till TDGETVEINFO TDCALL is executed. This ensures that VE
+ * info cannot be overwritten by a nested #VE.
+ */
+ tdx_get_ve_info(&ve);
+
+ cond_local_irq_enable(regs);
+
+ /*
+ * If tdx_handle_virt_exception() could not process
+ * it successfully, treat it as #GP(0) and handle it.
+ */
+ if (!tdx_handle_virt_exception(regs, &ve))
+ ve_raise_fault(regs, 0);
+
+ cond_local_irq_disable(regs);
+}
+
+#endif
+
#ifdef CONFIG_X86_32
DEFINE_IDTENTRY_SW(iret_error)
{
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 794fdef2501a..38185aedf7d1 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -339,11 +339,11 @@ static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
struct stack_info *info = &state->stack_info;
void *addr = (void *)_addr;
- if (!on_stack(info, addr, len) &&
- (get_stack_info(addr, state->task, info, &state->stack_mask)))
- return false;
+ if (on_stack(info, addr, len))
+ return true;
- return true;
+ return !get_stack_info(addr, state->task, info, &state->stack_mask) &&
+ on_stack(info, addr, len);
}
static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index c21bcd668284..e9e803a4d44c 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -151,7 +151,7 @@ exit_vm86:
memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
- lazy_load_gs(vm86->regs32.gs);
+ loadsegment(gs, vm86->regs32.gs);
regs->pt.ax = retval;
return;
@@ -325,7 +325,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
* Save old state
*/
vm86->saved_sp0 = tsk->thread.sp0;
- lazy_save_gs(vm86->regs32.gs);
+ savesegment(gs, vm86->regs32.gs);
/* make room for real-mode segments */
preempt_disable();
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b24ca7f4ed7c..de6d44e07e34 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -19,6 +19,7 @@
#include <asm/user.h>
#include <asm/fpu/xstate.h>
#include <asm/sgx.h>
+#include <asm/cpuid.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
@@ -744,24 +745,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
cpuid_count(entry->function, entry->index,
&entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
- switch (function) {
- case 4:
- case 7:
- case 0xb:
- case 0xd:
- case 0xf:
- case 0x10:
- case 0x12:
- case 0x14:
- case 0x17:
- case 0x18:
- case 0x1d:
- case 0x1e:
- case 0x1f:
- case 0x8000001d:
+ if (cpuid_function_is_indexed(function))
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- break;
- }
return entry;
}
@@ -887,6 +872,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
union cpuid10_eax eax;
union cpuid10_edx edx;
+ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
perf_get_x86_pmu_capability(&cap);
/*
@@ -1085,12 +1075,21 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
case 0x80000000:
entry->eax = min(entry->eax, 0x80000021);
/*
- * Serializing LFENCE is reported in a multitude of ways,
- * and NullSegClearsBase is not reported in CPUID on Zen2;
- * help userspace by providing the CPUID leaf ourselves.
+ * Serializing LFENCE is reported in a multitude of ways, and
+ * NullSegClearsBase is not reported in CPUID on Zen2; help
+ * userspace by providing the CPUID leaf ourselves.
+ *
+ * However, only do it if the host has CPUID leaf 0x8000001d.
+ * QEMU thinks that it can query the host blindly for that
+ * CPUID leaf if KVM reports that it supports 0x8000001d or
+ * above. The processor merrily returns values from the
+ * highest Intel leaf which QEMU tries to use as the guest's
+ * 0x8000001d. Even worse, this can result in an infinite
+ * loop if said highest leaf has no subleaves indexed by ECX.
*/
- if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
- || !static_cpu_has_bug(X86_BUG_NULL_SEG))
+ if (entry->eax >= 0x8000001d &&
+ (static_cpu_has(X86_FEATURE_LFENCE_RDTSC)
+ || !static_cpu_has_bug(X86_BUG_NULL_SEG)))
entry->eax = max(entry->eax, 0x80000021);
break;
case 0x80000001:
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 123b677111c5..a0702b6be3e8 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1135,11 +1135,13 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
+ mutex_lock(&hv->hv_lock);
+
if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_SET ||
hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
- return;
+ goto out_unlock;
- mutex_lock(&hv->hv_lock);
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
goto out_unlock;
@@ -1201,45 +1203,19 @@ out_unlock:
mutex_unlock(&hv->hv_lock);
}
-void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
+void kvm_hv_request_tsc_page_update(struct kvm *kvm)
{
struct kvm_hv *hv = to_kvm_hv(kvm);
- u64 gfn;
- int idx;
-
- if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
- hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
- tsc_page_update_unsafe(hv))
- return;
mutex_lock(&hv->hv_lock);
- if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
- goto out_unlock;
-
- /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */
- if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET)
- hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING;
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET &&
+ !tsc_page_update_unsafe(hv))
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
- gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-
- hv->tsc_ref.tsc_sequence = 0;
-
- /*
- * Take the srcu lock as memslots will be accessed to check the gfn
- * cache generation against the memslots generation.
- */
- idx = srcu_read_lock(&kvm->srcu);
- if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
- &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
- hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
- srcu_read_unlock(&kvm->srcu, idx);
-
-out_unlock:
mutex_unlock(&hv->hv_lock);
}
-
static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
{
if (!hv_vcpu->enforce_cpuid)
@@ -1938,7 +1914,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
- unsigned long valid_bank_mask;
+ u64 valid_bank_mask;
u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
u32 vector;
bool all_cpus;
@@ -1980,7 +1956,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask;
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
- if (hc->var_cnt != bitmap_weight(&valid_bank_mask, 64))
+ if (hc->var_cnt != bitmap_weight((unsigned long *)&valid_bank_mask, 64))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
if (all_cpus)
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e19c00ee9ab3..da2737f2a956 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -137,7 +137,7 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
-void kvm_hv_invalidate_tsc_page(struct kvm *kvm);
+void kvm_hv_request_tsc_page_update(struct kvm *kvm);
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e6cae6f22683..a335e7f1f69e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -65,6 +65,30 @@ static __always_inline u64 rsvd_bits(int s, int e)
return ((2ULL << (e - s)) - 1) << s;
}
+/*
+ * The number of non-reserved physical address bits irrespective of features
+ * that repurpose legal bits, e.g. MKTME.
+ */
+extern u8 __read_mostly shadow_phys_bits;
+
+static inline gfn_t kvm_mmu_max_gfn(void)
+{
+ /*
+ * Note that this uses the host MAXPHYADDR, not the guest's.
+ * EPT/NPT cannot support GPAs that would exceed host.MAXPHYADDR;
+ * assuming KVM is running on bare metal, guest accesses beyond
+ * host.MAXPHYADDR will hit a #PF(RSVD) and never cause a vmexit
+ * (either EPT Violation/Misconfig or #NPF), and so KVM will never
+ * install a SPTE for such addresses. If KVM is running as a VM
+ * itself, on the other hand, it might see a MAXPHYADDR that is less
+ * than hardware's real MAXPHYADDR. Using the host MAXPHYADDR
+ * disallows such SPTEs entirely and simplifies the TDP MMU.
+ */
+ int max_gpa_bits = likely(tdp_enabled) ? shadow_phys_bits : 52;
+
+ return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
+}
+
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8f19ea752704..45e1573f8f1d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -473,30 +473,6 @@ retry:
}
#endif
-static bool spte_has_volatile_bits(u64 spte)
-{
- if (!is_shadow_present_pte(spte))
- return false;
-
- /*
- * Always atomically update spte if it can be updated
- * out of mmu-lock, it can ensure dirty bit is not lost,
- * also, it can help us to get a stable is_writable_pte()
- * to ensure tlb flush is not missed.
- */
- if (spte_can_locklessly_be_made_writable(spte) ||
- is_access_track_spte(spte))
- return true;
-
- if (spte_ad_enabled(spte)) {
- if ((spte & shadow_accessed_mask) == 0 ||
- (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
- return true;
- }
-
- return false;
-}
-
/* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present
@@ -557,7 +533,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
- if (spte_can_locklessly_be_made_writable(old_spte) &&
+ if (is_mmu_writable_spte(old_spte) &&
!is_writable_pte(new_spte))
flush = true;
@@ -591,7 +567,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
u64 old_spte = *sptep;
int level = sptep_to_sp(sptep)->role.level;
- if (!spte_has_volatile_bits(old_spte))
+ if (!is_shadow_present_pte(old_spte) ||
+ !spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, 0ull);
else
old_spte = __update_clear_spte_slow(sptep, 0ull);
@@ -1187,7 +1164,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
u64 spte = *sptep;
if (!is_writable_pte(spte) &&
- !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+ !(pt_protect && is_mmu_writable_spte(spte)))
return false;
rmap_printk("spte %p %llx\n", sptep, *sptep);
@@ -2804,8 +2781,12 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
const struct kvm_memory_slot *slot)
{
unsigned long hva;
- pte_t *pte;
- int level;
+ unsigned long flags;
+ int level = PG_LEVEL_4K;
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
return PG_LEVEL_4K;
@@ -2820,10 +2801,43 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
*/
hva = __gfn_to_hva_memslot(slot, gfn);
- pte = lookup_address_in_mm(kvm->mm, hva, &level);
- if (unlikely(!pte))
- return PG_LEVEL_4K;
+ /*
+ * Lookup the mapping level in the current mm. The information
+ * may become stale soon, but it is safe to use as long as
+ * 1) mmu_notifier_retry was checked after taking mmu_lock, and
+ * 2) mmu_lock is taken now.
+ *
+ * We still need to disable IRQs to prevent concurrent tear down
+ * of page tables.
+ */
+ local_irq_save(flags);
+
+ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
+ if (pgd_none(pgd))
+ goto out;
+ p4d = READ_ONCE(*p4d_offset(&pgd, hva));
+ if (p4d_none(p4d) || !p4d_present(p4d))
+ goto out;
+
+ pud = READ_ONCE(*pud_offset(&p4d, hva));
+ if (pud_none(pud) || !pud_present(pud))
+ goto out;
+
+ if (pud_large(pud)) {
+ level = PG_LEVEL_1G;
+ goto out;
+ }
+
+ pmd = READ_ONCE(*pmd_offset(&pud, hva));
+ if (pmd_none(pmd) || !pmd_present(pmd))
+ goto out;
+
+ if (pmd_large(pmd))
+ level = PG_LEVEL_2M;
+
+out:
+ local_irq_restore(flags);
return level;
}
@@ -2992,9 +3006,15 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
/*
* If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an
- * MMIO SPTE will just be an expensive nop.
+ * MMIO SPTE will just be an expensive nop. Do not cache MMIO
+ * whose gfn is greater than host.MAXPHYADDR, any guest that
+ * generates such gfns is running nested and is being tricked
+ * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
+ * and only if L1's MAXPHYADDR is inaccurate with respect to
+ * the hardware's).
*/
- if (unlikely(!shadow_mmio_value)) {
+ if (unlikely(!shadow_mmio_value) ||
+ unlikely(fault->gfn > kvm_mmu_max_gfn())) {
*ret_val = RET_PF_EMULATE;
return true;
}
@@ -3153,8 +3173,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* be removed in the fast path only if the SPTE was
* write-protected for dirty-logging or access tracking.
*/
- if (fault->write &&
- spte_can_locklessly_be_made_writable(spte)) {
+ if (fault->write && is_mmu_writable_spte(spte)) {
new_spte |= PT_WRITABLE_MASK;
/*
@@ -5451,14 +5470,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
uint i;
if (pcid == kvm_get_active_pcid(vcpu)) {
- mmu->invlpg(vcpu, gva, mmu->root.hpa);
+ if (mmu->invlpg)
+ mmu->invlpg(vcpu, gva, mmu->root.hpa);
tlb_flush = true;
}
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ if (mmu->invlpg)
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
tlb_flush = true;
}
}
@@ -5646,6 +5667,7 @@ static void kvm_zap_obsolete_pages(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
int nr_zapped, batch = 0;
+ bool unstable;
restart:
list_for_each_entry_safe_reverse(sp, node,
@@ -5677,11 +5699,12 @@ restart:
goto restart;
}
- if (__kvm_mmu_prepare_zap_page(kvm, sp,
- &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
- batch += nr_zapped;
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages, &nr_zapped);
+ batch += nr_zapped;
+
+ if (unstable)
goto restart;
- }
}
/*
@@ -6237,12 +6260,24 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
return 0;
}
-int kvm_mmu_module_init(void)
+/*
+ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
+ * its default value of -1 is technically undefined behavior for a boolean.
+ */
+void kvm_mmu_x86_module_init(void)
{
- int ret = -ENOMEM;
-
if (nx_huge_pages == -1)
__set_nx_huge_pages(get_nx_auto_mode());
+}
+
+/*
+ * The bulk of the MMU initialization is deferred until the vendor module is
+ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
+ * to be reset when a potentially different vendor module is loaded.
+ */
+int kvm_mmu_vendor_module_init(void)
+{
+ int ret = -ENOMEM;
/*
* MMU roles use union aliasing which is, generally speaking, an
@@ -6290,7 +6325,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
mmu_free_memory_caches(vcpu);
}
-void kvm_mmu_module_exit(void)
+void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4739b53c9734..e5c0b6db6f2c 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
+/*
+ * Returns true if the SPTE has bits that may be set without holding mmu_lock.
+ * The caller is responsible for checking if the SPTE is shadow-present, and
+ * for determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_has_volatile_bits(u64 spte)
+{
+ /*
+ * Always atomically update spte if it can be updated
+ * out of mmu-lock, it can ensure dirty bit is not lost,
+ * also, it can help us to get a stable is_writable_pte()
+ * to ensure tlb flush is not missed.
+ */
+ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+ return true;
+
+ if (is_access_track_spte(spte))
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ if (!(spte & shadow_accessed_mask) ||
+ (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+ return true;
+ }
+
+ return false;
+}
+
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 73f12615416f..80ab0f5cff01 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -201,12 +201,6 @@ static inline bool is_removed_spte(u64 spte)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
-/*
- * The number of non-reserved physical address bits irrespective of features
- * that repurpose legal bits, e.g. MKTME.
- */
-extern u8 __read_mostly shadow_phys_bits;
-
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
@@ -396,7 +390,7 @@ static inline void check_spte_writable_invariants(u64 spte)
"kvm: Writable SPTE is not MMU-writable: %llx", spte);
}
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+static inline bool is_mmu_writable_spte(u64 spte)
{
return spte & shadow_mmu_writable_mask;
}
@@ -410,6 +404,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
+bool spte_has_volatile_bits(u64 spte);
+
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index b1eaf6ec0e0b..f0af385c56e0 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -6,6 +6,7 @@
#include <linux/kvm_host.h>
#include "mmu.h"
+#include "spte.h"
/*
* TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{
return READ_ONCE(*rcu_dereference(sptep));
}
-static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+ return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+ WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, int level)
{
- WRITE_ONCE(*rcu_dereference(sptep), val);
+ /*
+ * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+ * volatile bits, i.e. has bits that can be set outside of mmu_lock.
+ * The Writable bit can be set by KVM's fast page fault handler, and
+ * Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+ if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte))
+ return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return old_spte;
}
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d71d177ae6b8..922b06bf4b94 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -51,7 +51,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
if (!kvm->arch.tdp_mmu_enabled)
return;
- flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
+ /* Also waits for any queued work items. */
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
tdp_mmu_unlink_sp(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- u64 *sptep = rcu_dereference(pt) + i;
+ tdp_ptep_t sptep = pt + i;
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
- u64 old_child_spte;
+ u64 old_spte;
if (shared) {
/*
@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* value to the removed SPTE value.
*/
for (;;) {
- old_child_spte = xchg(sptep, REMOVED_SPTE);
- if (!is_removed_spte(old_child_spte))
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+ if (!is_removed_spte(old_spte))
break;
cpu_relax();
}
@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* are guarded by the memslots generation, not by being
* unreachable.
*/
- old_child_spte = READ_ONCE(*sptep);
- if (!is_shadow_present_pte(old_child_spte))
+ old_spte = kvm_tdp_mmu_read_spte(sptep);
+ if (!is_shadow_present_pte(old_spte))
continue;
/*
- * Marking the SPTE as a removed SPTE is not
- * strictly necessary here as the MMU lock will
- * stop other threads from concurrently modifying
- * this SPTE. Using the removed SPTE value keeps
- * the two branches consistent and simplifies
- * the function.
+ * Use the common helper instead of a raw WRITE_ONCE as
+ * the SPTE needs to be updated atomically if it can be
+ * modified by a different vCPU outside of mmu_lock.
+ * Even though the parent SPTE is !PRESENT, the TLB
+ * hasn't yet been flushed, and both Intel and AMD
+ * document that A/D assists can use upper-level PxE
+ * entries that are cached in the TLB, i.e. the CPU can
+ * still access the page and mark it dirty.
+ *
+ * No retry is needed in the atomic update path as the
+ * sole concern is dropping a Dirty bit, i.e. no other
+ * task can zap/remove the SPTE as mmu_lock is held for
+ * write. Marking the SPTE as a removed SPTE is not
+ * strictly necessary for the same reason, but using
+ * the remove SPTE value keeps the shared/exclusive
+ * paths consistent and allows the handle_changed_spte()
+ * call below to hardcode the new value to REMOVED_SPTE.
+ *
+ * Note, even though dropping a Dirty bit is the only
+ * scenario where a non-atomic update could result in a
+ * functional bug, simply checking the Dirty bit isn't
+ * sufficient as a fast page fault could read the upper
+ * level SPTE before it is zapped, and then make this
+ * target SPTE writable, resume the guest, and set the
+ * Dirty bit between reading the SPTE above and writing
+ * it here.
*/
- WRITE_ONCE(*sptep, REMOVED_SPTE);
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+ REMOVED_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_child_spte, REMOVED_SPTE, level,
- shared);
+ old_spte, REMOVED_SPTE, level, shared);
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
KVM_PAGES_PER_HPAGE(iter->level));
/*
- * No other thread can overwrite the removed SPTE as they
- * must either wait on the MMU lock or use
- * tdp_mmu_set_spte_atomic which will not overwrite the
- * special removed SPTE value. No bookkeeping is needed
- * here since the SPTE is going from non-present
- * to non-present.
+ * No other thread can overwrite the removed SPTE as they must either
+ * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+ * overwrite the special removed SPTE value. No bookkeeping is needed
+ * here since the SPTE is going from non-present to non-present. Use
+ * the raw write helper to avoid an unnecessary check on volatile bits.
*/
- kvm_tdp_mmu_write_spte(iter->sptep, 0);
+ __kvm_tdp_mmu_write_spte(iter->sptep, 0);
return 0;
}
@@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* unless performing certain dirty logging operations.
* Leaving record_dirty_log unset in that case prevents page
* writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had voldatile bits.
*/
-static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
- u64 old_spte, u64 new_spte, gfn_t gfn, int level,
- bool record_acc_track, bool record_dirty_log)
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+ bool record_acc_track, bool record_dirty_log)
{
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
*/
WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
- kvm_tdp_mmu_write_spte(sptep, new_spte);
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
if (record_dirty_log)
handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
new_spte, level);
+ return old_spte;
}
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
{
WARN_ON_ONCE(iter->yielded);
- __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
- new_spte, iter->gfn, iter->level,
- record_acc_track, record_dirty_log);
+ iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level,
+ record_acc_track, record_dirty_log);
}
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -815,14 +839,15 @@ static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
return iter->yielded;
}
-static inline gfn_t tdp_mmu_max_gfn_host(void)
+static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
/*
- * Bound TDP MMU walks at host.MAXPHYADDR, guest accesses beyond that
- * will hit a #PF(RSVD) and never hit an EPT Violation/Misconfig / #NPF,
- * and so KVM will never install a SPTE for such addresses.
+ * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
+ * a gpa range that would exceed the max gfn, and KVM does not create
+ * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
+ * the slow emulation path every time.
*/
- return 1ULL << (shadow_phys_bits - PAGE_SHIFT);
+ return kvm_mmu_max_gfn() + 1;
}
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
@@ -830,7 +855,7 @@ static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
- gfn_t end = tdp_mmu_max_gfn_host();
+ gfn_t end = tdp_mmu_max_gfn_exclusive();
gfn_t start = 0;
for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
@@ -923,7 +948,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
{
struct tdp_iter iter;
- end = min(end, tdp_mmu_max_gfn_host());
+ end = min(end, tdp_mmu_max_gfn_exclusive());
lockdep_assert_held_write(&kvm->mmu_lock);
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index eca39f56c231..0604bc29f0b8 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -171,9 +171,12 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true;
}
-static int cmp_u64(const void *a, const void *b)
+static int cmp_u64(const void *pa, const void *pb)
{
- return *(__u64 *)a - *(__u64 *)b;
+ u64 a = *(u64 *)pa;
+ u64 b = *(u64 *)pb;
+
+ return (a > b) - (a < b);
}
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 9e66fba1d6a3..22992b049d38 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -138,6 +138,15 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
return sample_period;
}
+static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
+{
+ if (!pmc->perf_event || pmc->is_paused)
+ return;
+
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, pmc->counter));
+}
+
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index a1cf9c31273b..421619540ff9 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -837,7 +837,8 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
BIT(APICV_INHIBIT_REASON_IRQWIN) |
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
BIT(APICV_INHIBIT_REASON_X2APIC) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+ BIT(APICV_INHIBIT_REASON_SEV);
return supported & BIT(reason);
}
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 24eb935b6f85..16a5ebb420cf 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -45,6 +45,22 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
};
+/* duplicated from amd_f17h_perfmon_event_map. */
+static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
+ [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
+ [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+/* amd_pmc_perf_hw_id depends on these being the same size */
+static_assert(ARRAY_SIZE(amd_event_mapping) ==
+ ARRAY_SIZE(amd_f17h_event_mapping));
+
static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
@@ -140,6 +156,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_event_hw_type_mapping *event_mapping;
u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
@@ -148,15 +165,20 @@ static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
if (WARN_ON(pmc_is_fixed(pmc)))
return PERF_COUNT_HW_MAX;
+ if (guest_cpuid_family(pmc->vcpu) >= 0x17)
+ event_mapping = amd_f17h_event_mapping;
+ else
+ event_mapping = amd_event_mapping;
+
for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
- if (amd_event_mapping[i].eventsel == event_select
- && amd_event_mapping[i].unit_mask == unit_mask)
+ if (event_mapping[i].eventsel == event_select
+ && event_mapping[i].unit_mask == unit_mask)
break;
if (i == ARRAY_SIZE(amd_event_mapping))
return PERF_COUNT_HW_MAX;
- return amd_event_mapping[i].event_type;
+ return event_mapping[i].event_type;
}
/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
@@ -257,6 +279,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER);
if (pmc) {
pmc->counter += data - pmc_read_counter(pmc);
+ pmc_update_sample_period(pmc);
return 0;
}
/* MSR_EVNTSELn */
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 75fa6dd268f0..636c77ef55fc 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -260,6 +260,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
+
return 0;
e_free:
@@ -465,6 +467,7 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
page_virtual = kmap_atomic(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
kunmap_atomic(page_virtual);
+ cond_resched();
}
}
@@ -559,12 +562,20 @@ e_unpin:
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
- struct vmcb_save_area *save = &svm->vmcb->save;
+ struct sev_es_save_area *save = svm->sev_es.vmsa;
/* Check some debug related fields before encrypting the VMSA */
- if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1))
+ if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
return -EINVAL;
+ /*
+ * SEV-ES will use a VMSA that is pointed to by the VMCB, not
+ * the traditional VMSA that is part of the VMCB. Copy the
+ * traditional VMSA as it has been built so far (in prep
+ * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
+ */
+ memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save));
+
/* Sync registgers */
save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX];
save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX];
@@ -592,14 +603,6 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
- /*
- * SEV-ES will use a VMSA that is pointed to by the VMCB, not
- * the traditional VMSA that is part of the VMCB. Copy the
- * traditional VMSA as it has been built so far (in prep
- * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state.
- */
- memcpy(svm->sev_es.vmsa, save, sizeof(*save));
-
return 0;
}
@@ -1591,24 +1594,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
atomic_set_release(&src_sev->migration_in_progress, 0);
}
+/* vCPU mutex subclasses. */
+enum sev_migration_role {
+ SEV_MIGRATION_SOURCE = 0,
+ SEV_MIGRATION_TARGET,
+ SEV_NR_MIGRATION_ROLES,
+};
-static int sev_lock_vcpus_for_migration(struct kvm *kvm)
+static int sev_lock_vcpus_for_migration(struct kvm *kvm,
+ enum sev_migration_role role)
{
struct kvm_vcpu *vcpu;
unsigned long i, j;
+ bool first = true;
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (mutex_lock_killable(&vcpu->mutex))
+ if (mutex_lock_killable_nested(&vcpu->mutex, role))
goto out_unlock;
+
+ if (first) {
+ /*
+ * Reset the role to one that avoids colliding with
+ * the role used for the first vcpu mutex.
+ */
+ role = SEV_NR_MIGRATION_ROLES;
+ first = false;
+ } else {
+ mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
+ }
}
return 0;
out_unlock:
+
+ first = true;
kvm_for_each_vcpu(j, vcpu, kvm) {
if (i == j)
break;
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
+
+
mutex_unlock(&vcpu->mutex);
}
return -EINTR;
@@ -1618,8 +1648,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;
+ bool first = true;
kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (first)
+ first = false;
+ else
+ mutex_acquire(&vcpu->mutex.dep_map,
+ SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
+
mutex_unlock(&vcpu->mutex);
}
}
@@ -1745,10 +1782,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
charged = true;
}
- ret = sev_lock_vcpus_for_migration(kvm);
+ ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
if (ret)
goto out_dst_cgroup;
- ret = sev_lock_vcpus_for_migration(source_kvm);
+ ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
if (ret)
goto out_dst_vcpu;
@@ -2223,51 +2260,47 @@ int sev_cpu_init(struct svm_cpu_data *sd)
* Pages used by hardware to hold guest encrypted state must be flushed before
* returning them to the system.
*/
-static void sev_flush_guest_memory(struct vcpu_svm *svm, void *va,
- unsigned long len)
+static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
{
+ int asid = to_kvm_svm(vcpu->kvm)->sev_info.asid;
+
/*
- * If hardware enforced cache coherency for encrypted mappings of the
- * same physical page is supported, nothing to do.
+ * Note! The address must be a kernel address, as regular page walk
+ * checks are performed by VM_PAGE_FLUSH, i.e. operating on a user
+ * address is non-deterministic and unsafe. This function deliberately
+ * takes a pointer to deter passing in a user address.
*/
- if (boot_cpu_has(X86_FEATURE_SME_COHERENT))
- return;
+ unsigned long addr = (unsigned long)va;
/*
- * If the VM Page Flush MSR is supported, use it to flush the page
- * (using the page virtual address and the guest ASID).
+ * If CPU enforced cache coherency for encrypted mappings of the
+ * same physical page is supported, use CLFLUSHOPT instead. NOTE: cache
+ * flush is still needed in order to work properly with DMA devices.
*/
- if (boot_cpu_has(X86_FEATURE_VM_PAGE_FLUSH)) {
- struct kvm_sev_info *sev;
- unsigned long va_start;
- u64 start, stop;
-
- /* Align start and stop to page boundaries. */
- va_start = (unsigned long)va;
- start = (u64)va_start & PAGE_MASK;
- stop = PAGE_ALIGN((u64)va_start + len);
-
- if (start < stop) {
- sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
+ if (boot_cpu_has(X86_FEATURE_SME_COHERENT)) {
+ clflush_cache_range(va, PAGE_SIZE);
+ return;
+ }
- while (start < stop) {
- wrmsrl(MSR_AMD64_VM_PAGE_FLUSH,
- start | sev->asid);
+ /*
+ * VM Page Flush takes a host virtual address and a guest ASID. Fall
+ * back to WBINVD if this faults so as not to make any problems worse
+ * by leaving stale encrypted data in the cache.
+ */
+ if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH, addr | asid)))
+ goto do_wbinvd;
- start += PAGE_SIZE;
- }
+ return;
- return;
- }
+do_wbinvd:
+ wbinvd_on_all_cpus();
+}
- WARN(1, "Address overflow, using WBINVD\n");
- }
+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+ if (!sev_guest(kvm))
+ return;
- /*
- * Hardware should always have one of the above features,
- * but if not, use WBINVD and issue a warning.
- */
- WARN_ONCE(1, "Using WBINVD to flush guest memory\n");
wbinvd_on_all_cpus();
}
@@ -2281,7 +2314,8 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
if (vcpu->arch.guest_state_protected)
- sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE);
+ sev_flush_encrypted_page(vcpu, svm->sev_es.vmsa);
+
__free_page(virt_to_page(svm->sev_es.vmsa));
if (svm->sev_es.ghcb_sa_free)
@@ -2932,7 +2966,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
sev_enc_bit));
}
-void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa)
+void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
{
/*
* As an SEV-ES guest, hardware will restore the host state on VMEXIT,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index bd4c64b362d2..17d334ef5430 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1270,8 +1270,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/
vmsave(__sme_page_pa(sd->save_area));
if (sev_es_guest(vcpu->kvm)) {
- struct vmcb_save_area *hostsa;
- hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
+ struct sev_es_save_area *hostsa;
+ hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
sev_es_prepare_switch_to_guest(hostsa);
}
@@ -3117,8 +3117,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"tr:",
save01->tr.selector, save01->tr.attrib,
save01->tr.limit, save01->tr.base);
- pr_err("cpl: %d efer: %016llx\n",
- save->cpl, save->efer);
+ pr_err("vmpl: %d cpl: %d efer: %016llx\n",
+ save->vmpl, save->cpl, save->efer);
pr_err("%-15s %016llx %-13s %016llx\n",
"cr0:", save->cr0, "cr2:", save->cr2);
pr_err("%-15s %016llx %-13s %016llx\n",
@@ -4620,6 +4620,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
.mem_enc_unregister_region = sev_mem_enc_unregister_region,
+ .guest_memory_reclaimed = sev_guest_memory_reclaimed,
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f77a7d2d39dd..2d83845b9032 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -181,7 +181,7 @@ struct svm_nested_state {
struct vcpu_sev_es_state {
/* SEV-ES support */
- struct vmcb_save_area *vmsa;
+ struct sev_es_save_area *vmsa;
struct ghcb *ghcb;
struct kvm_host_map ghcb_map;
bool received_first_sipi;
@@ -609,6 +609,8 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
struct kvm_enc_region *range);
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
+void sev_guest_memory_reclaimed(struct kvm *kvm);
+
void pre_sev_run(struct vcpu_svm *svm, int cpu);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
@@ -620,7 +622,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa);
+void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
/* vmenter.S */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f18744f7ff82..856c87563883 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4618,6 +4618,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
}
+ if (vmx->nested.update_vmcs01_apicv_status) {
+ vmx->nested.update_vmcs01_apicv_status = false;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
if ((vm_exit_reason != -1) &&
(enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bc3f8512bb64..b82b6709d7a8 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -431,15 +431,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event && !pmc->is_paused)
- perf_event_period(pmc->perf_event,
- get_sample_period(pmc, data));
+ pmc_update_sample_period(pmc);
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event && !pmc->is_paused)
- perf_event_period(pmc->perf_event,
- get_sample_period(pmc, data));
+ pmc_update_sample_period(pmc);
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 04d170c4b61e..610355b9ccce 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4174,6 +4174,11 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_apicv_status = true;
+ return;
+ }
+
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
if (cpu_has_secondary_exec_ctrls()) {
if (kvm_vcpu_apicv_active(vcpu))
@@ -5467,7 +5472,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
return vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending;
+ (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
}
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9c6bfcd84008..b98c7e96697a 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -183,6 +183,7 @@ struct nested_vmx {
bool change_vmcs01_virtual_apic_mode;
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
+ bool update_vmcs01_apicv_status;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c0ca599a353..4790f0d7d40b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2901,7 +2901,7 @@ static void kvm_end_pvclock_update(struct kvm *kvm)
static void kvm_update_masterclock(struct kvm *kvm)
{
- kvm_hv_invalidate_tsc_page(kvm);
+ kvm_hv_request_tsc_page_update(kvm);
kvm_start_pvclock_update(kvm);
pvclock_update_vm_gtod_copy(kvm);
kvm_end_pvclock_update(kvm);
@@ -3113,8 +3113,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_set)
kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
- if (!v->vcpu_idx)
- kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -6241,7 +6240,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
return -EINVAL;
- kvm_hv_invalidate_tsc_page(kvm);
+ kvm_hv_request_tsc_page_update(kvm);
kvm_start_pvclock_update(kvm);
pvclock_update_vm_gtod_copy(kvm);
@@ -8926,7 +8925,7 @@ int kvm_arch_init(void *opaque)
}
kvm_nr_uret_msrs = 0;
- r = kvm_mmu_module_init();
+ r = kvm_mmu_vendor_module_init();
if (r)
goto out_free_percpu;
@@ -8974,7 +8973,7 @@ void kvm_arch_exit(void)
cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops.hardware_enable = NULL;
- kvm_mmu_module_exit();
+ kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
#ifdef CONFIG_KVM_XEN
@@ -9112,7 +9111,7 @@ static void kvm_apicv_init(struct kvm *kvm)
if (!enable_apicv)
set_or_clear_apicv_inhibit(inhibits,
- APICV_INHIBIT_REASON_ABSENT, true);
+ APICV_INHIBIT_REASON_DISABLE, true);
}
static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
@@ -9890,6 +9889,11 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+{
+ static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
+}
+
static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
@@ -10016,12 +10020,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ vcpu->run->system_event.ndata = 0;
r = 0;
goto out;
}
@@ -10098,7 +10104,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Store vcpu->apicv_active before vcpu->mode. */
smp_store_release(&vcpu->mode, IN_GUEST_MODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
/*
* 1) We should set ->mode before checking ->requests. Please see
@@ -10129,7 +10135,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_wmb();
local_irq_enable();
preempt_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
r = 1;
goto cancel_injection;
}
@@ -10255,7 +10261,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
local_irq_enable();
preempt_enable();
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
/*
* Profile KVM exit RIPs:
@@ -10285,7 +10291,7 @@ out:
}
/* Called within kvm->srcu read side. */
-static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+static inline int vcpu_block(struct kvm_vcpu *vcpu)
{
bool hv_timer;
@@ -10301,12 +10307,12 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
if (hv_timer)
kvm_lapic_switch_to_sw_timer(vcpu);
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
kvm_vcpu_halt(vcpu);
else
kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
@@ -10348,7 +10354,6 @@ static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
- struct kvm *kvm = vcpu->kvm;
vcpu->arch.l1tf_flush_l1d = true;
@@ -10356,7 +10361,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
- r = vcpu_block(kvm, vcpu);
+ r = vcpu_block(vcpu);
}
if (r <= 0)
@@ -10375,9 +10380,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
}
if (__xfer_to_guest_mode_work_pending()) {
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
r = xfer_to_guest_mode_handle_work(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (r)
return r;
}
@@ -10388,12 +10393,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
- int r;
-
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- return r;
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
}
static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@@ -10485,7 +10485,6 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- struct kvm *kvm = vcpu->kvm;
int r;
vcpu_load(vcpu);
@@ -10493,7 +10492,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
r = -EINTR;
@@ -10505,9 +10504,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
*/
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
kvm_vcpu_block(vcpu);
- vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm_vcpu_srcu_read_lock(vcpu);
if (kvm_apic_accept_events(vcpu) < 0) {
r = 0;
@@ -10568,7 +10567,7 @@ out:
if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
- srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+ kvm_vcpu_srcu_read_unlock(vcpu);
kvm_sigset_deactivate(vcpu);
vcpu_put(vcpu);
@@ -10986,6 +10985,9 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long i;
+ if (!enable_apicv)
+ return;
+
down_write(&kvm->arch.apicv_update_lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -11197,8 +11199,21 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
- if (kvm_apicv_activated(vcpu->kvm))
+
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as
+ * this vCPU will not get notified of any changes until this
+ * vCPU is visible to other vCPUs (marked online and added to
+ * the set of vCPUs). Opportunistically mark APICv active as
+ * VMX in particularly is highly unlikely to have inhibits.
+ * Ignore the current per-VM APICv state so that vCPU creation
+ * is guaranteed to run with a deterministic value, the request
+ * will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
vcpu->arch.apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
} else
static_branch_inc(&kvm_has_noapic_vcpu);
@@ -11996,8 +12011,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
+ if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
+ return -EINVAL;
+
return kvm_alloc_memslot_metadata(kvm, new);
+ }
if (change == KVM_MR_FLAGS_ONLY)
memcpy(&new->arch, &old->arch, sizeof(old->arch));
@@ -12986,3 +13005,19 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
+
+static int __init kvm_x86_init(void)
+{
+ kvm_mmu_x86_module_init();
+ return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+ /*
+ * If module_init() is implemented, module_exit() must also be
+ * implemented to allow module unload.
+ */
+}
+module_exit(kvm_x86_exit);
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 8ca5ecf16dc4..9dec1b38a98f 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -53,12 +53,12 @@
SYM_FUNC_START(copy_user_generic_unrolled)
ASM_STAC
cmpl $8,%edx
- jb 20f /* less then 8 bytes, go to byte copy loop */
+ jb .Lcopy_user_short_string_bytes
ALIGN_DESTINATION
movl %edx,%ecx
andl $63,%edx
shrl $6,%ecx
- jz .L_copy_short_string
+ jz copy_user_short_string
1: movq (%rsi),%r8
2: movq 1*8(%rsi),%r9
3: movq 2*8(%rsi),%r10
@@ -79,37 +79,11 @@ SYM_FUNC_START(copy_user_generic_unrolled)
leaq 64(%rdi),%rdi
decl %ecx
jnz 1b
-.L_copy_short_string:
- movl %edx,%ecx
- andl $7,%edx
- shrl $3,%ecx
- jz 20f
-18: movq (%rsi),%r8
-19: movq %r8,(%rdi)
- leaq 8(%rsi),%rsi
- leaq 8(%rdi),%rdi
- decl %ecx
- jnz 18b
-20: andl %edx,%edx
- jz 23f
- movl %edx,%ecx
-21: movb (%rsi),%al
-22: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 21b
-23: xor %eax,%eax
- ASM_CLAC
- RET
+ jmp copy_user_short_string
30: shll $6,%ecx
addl %ecx,%edx
- jmp 60f
-40: leal (%rdx,%rcx,8),%edx
- jmp 60f
-50: movl %ecx,%edx
-60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */
+ jmp .Lcopy_user_handle_tail
_ASM_EXTABLE_CPY(1b, 30b)
_ASM_EXTABLE_CPY(2b, 30b)
@@ -127,10 +101,6 @@ SYM_FUNC_START(copy_user_generic_unrolled)
_ASM_EXTABLE_CPY(14b, 30b)
_ASM_EXTABLE_CPY(15b, 30b)
_ASM_EXTABLE_CPY(16b, 30b)
- _ASM_EXTABLE_CPY(18b, 40b)
- _ASM_EXTABLE_CPY(19b, 40b)
- _ASM_EXTABLE_CPY(21b, 50b)
- _ASM_EXTABLE_CPY(22b, 50b)
SYM_FUNC_END(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)
@@ -191,7 +161,7 @@ EXPORT_SYMBOL(copy_user_generic_string)
SYM_FUNC_START(copy_user_enhanced_fast_string)
ASM_STAC
/* CPUs without FSRM should avoid rep movsb for short copies */
- ALTERNATIVE "cmpl $64, %edx; jb .L_copy_short_string", "", X86_FEATURE_FSRM
+ ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM
movl %edx,%ecx
1: rep movsb
xorl %eax,%eax
@@ -244,6 +214,53 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
SYM_CODE_END(.Lcopy_user_handle_tail)
/*
+ * Finish memcpy of less than 64 bytes. #AC should already be set.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count (< 64)
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+SYM_CODE_START_LOCAL(copy_user_short_string)
+ movl %edx,%ecx
+ andl $7,%edx
+ shrl $3,%ecx
+ jz .Lcopy_user_short_string_bytes
+18: movq (%rsi),%r8
+19: movq %r8,(%rdi)
+ leaq 8(%rsi),%rsi
+ leaq 8(%rdi),%rdi
+ decl %ecx
+ jnz 18b
+.Lcopy_user_short_string_bytes:
+ andl %edx,%edx
+ jz 23f
+ movl %edx,%ecx
+21: movb (%rsi),%al
+22: movb %al,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz 21b
+23: xor %eax,%eax
+ ASM_CLAC
+ RET
+
+40: leal (%rdx,%rcx,8),%edx
+ jmp 60f
+50: movl %ecx,%edx /* ecx is zerorest also */
+60: jmp .Lcopy_user_handle_tail
+
+ _ASM_EXTABLE_CPY(18b, 40b)
+ _ASM_EXTABLE_CPY(19b, 40b)
+ _ASM_EXTABLE_CPY(21b, 50b)
+ _ASM_EXTABLE_CPY(22b, 50b)
+SYM_CODE_END(copy_user_short_string)
+
+/*
* copy_user_nocache - Uncached memory copy with exception handling
* This will force destination out of cache for more performance.
*
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 65d15df6212d..0e65d00e2339 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -54,8 +54,8 @@ static void delay_loop(u64 __loops)
" jnz 2b \n"
"3: dec %0 \n"
- : /* we don't need output */
- :"a" (loops)
+ : "+a" (loops)
+ :
);
}
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index b781d324211b..21104c41cba0 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -342,9 +342,9 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
*/
static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
{
-#ifdef CONFIG_X86_64
unsigned short sel;
+#ifdef CONFIG_X86_64
switch (seg_reg_idx) {
case INAT_SEG_REG_IGNORE:
return 0;
@@ -402,7 +402,8 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
case INAT_SEG_REG_FS:
return (unsigned short)(regs->fs & 0xffff);
case INAT_SEG_REG_GS:
- return get_user_gs(regs);
+ savesegment(gs, sel);
+ return sel;
case INAT_SEG_REG_IGNORE:
default:
return -EINVAL;
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index 2b3eb8c948a3..a58f451a7dd3 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -11,7 +11,7 @@
#include <asm/msr.h>
#include <asm/archrandom.h>
#include <asm/e820/api.h>
-#include <asm/io.h>
+#include <asm/shared/io.h>
/*
* When built for the regular kernel, several functions need to be stubbed out
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/arch/x86/lib/mmx_32.c
+++ /dev/null
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index ecb2049c1273..b7dfd60243b7 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -48,6 +48,7 @@ SYM_FUNC_START(__put_user_1)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
1: movb %al,(%_ASM_CX)
xor %ecx,%ecx
@@ -62,6 +63,7 @@ SYM_FUNC_START(__put_user_2)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
2: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
@@ -76,6 +78,7 @@ SYM_FUNC_START(__put_user_4)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
3: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
@@ -90,6 +93,7 @@ SYM_FUNC_START(__put_user_8)
cmp %_ASM_BX,%_ASM_CX
jae .Lbad_put_user
SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL)
+ ENDBR
ASM_STAC
4: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 5f87bab4fb8d..b2b2366885a2 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -31,6 +31,7 @@
.align RETPOLINE_THUNK_SIZE
SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
__stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
@@ -55,7 +56,6 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
.align RETPOLINE_THUNK_SIZE
SYM_CODE_START(__x86_indirect_thunk_array)
- ANNOTATE_NOENDBR // apply_retpolines
#define GEN(reg) THUNK reg
#include <asm/GEN-for-each-reg.h>
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0402a749f3a0..0ae6cf804197 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -119,7 +119,7 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
/* cache copy and flush to align dest */
if (!IS_ALIGNED(dest, 8)) {
- unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
+ size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest);
memcpy((void *) dest, (void *) source, len);
clean_cache_range((void *) dest, len);
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index b82ca14ba718..4a9fd9029a53 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -153,7 +153,7 @@ static long pm_address(u_char FPU_modrm, u_char segment,
switch (segment) {
case PREFIX_GS_ - 1:
/* user gs handling can be lazy, use special accessors */
- addr->selector = get_user_gs(FPU_info->regs);
+ savesegment(gs, addr->selector);
break;
default:
addr->selector = PM_REG_(segment);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fe3d3061fc11..d957dc15b371 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -20,13 +20,12 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
- pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o
+ pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o
obj-y += pat/
# Make sure __phys_addr has no stackprotector
CFLAGS_physaddr.o := -fno-stack-protector
-CFLAGS_setup_nx.o := -fno-stack-protector
CFLAGS_mem_encrypt_identity.o := -fno-stack-protector
CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 058b2f36b3a6..b3ca7d23e4b0 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -154,7 +154,7 @@ int __init amd_numa_init(void)
node_set(nodeid, numa_nodes_parsed);
}
- if (!nodes_weight(numa_nodes_parsed))
+ if (nodes_empty(numa_nodes_parsed))
return -ENOENT;
/*
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d0074c6ed31a..fad8faa29d04 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -149,7 +149,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
unsigned char opcode;
if (user_mode(regs)) {
- if (get_user(opcode, instr))
+ if (get_user(opcode, (unsigned char __user *) instr))
break;
} else {
if (get_kernel_nofault(opcode, instr))
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 96d34ebb20a9..61d0ab154f96 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -110,7 +110,6 @@ int force_personality32;
/*
* noexec32=on|off
* Control non executable heap for 32bit processes.
- * To control the stack too use noexec=off
*
* on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
* off PROT_READ implies PROT_EXEC
@@ -902,6 +901,8 @@ static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end
static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
+ const unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
vmemmap_flush_unused_pmd();
/*
@@ -914,8 +915,7 @@ static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long
* Mark with PAGE_UNUSED the unused parts of the new memmap range
*/
if (!IS_ALIGNED(start, PMD_SIZE))
- memset((void *)start, PAGE_UNUSED,
- start - ALIGN_DOWN(start, PMD_SIZE));
+ memset((void *)page, PAGE_UNUSED, start - page);
/*
* We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 17a492c27306..1ad0228f8ceb 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -242,10 +242,15 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size,
* If the page being mapped is in memory and SEV is active then
* make sure the memory encryption attribute is enabled in the
* resulting mapping.
+ * In TDX guests, memory is marked private by default. If encryption
+ * is not requested (using encrypted), explicitly set decrypt
+ * attribute in all IOREMAPPED memory.
*/
prot = PAGE_KERNEL_IO;
if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted)
prot = pgprot_encrypted(prot);
+ else
+ prot = pgprot_decrypted(prot);
switch (pcm) {
case _PAGE_CACHE_MODE_UC:
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 50d209939c66..11350e2fd736 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -42,7 +42,14 @@ bool force_dma_unencrypted(struct device *dev)
static void print_mem_encrypt_feature_info(void)
{
- pr_info("AMD Memory Encryption Features active:");
+ pr_info("Memory Encryption Features active:");
+
+ if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ pr_cont(" Intel TDX\n");
+ return;
+ }
+
+ pr_cont(" AMD");
/* Secure Memory Encryption */
if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
@@ -62,6 +69,10 @@ static void print_mem_encrypt_feature_info(void)
if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
pr_cont(" SEV-ES");
+ /* Secure Nested Paging */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ pr_cont(" SEV-SNP");
+
pr_cont("\n");
}
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 6169053c2854..d3c88d9ef8d6 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -31,6 +31,7 @@
#include <asm/processor-flags.h>
#include <asm/msr.h>
#include <asm/cmdline.h>
+#include <asm/sev.h>
#include "mm_internal.h"
@@ -48,6 +49,36 @@ EXPORT_SYMBOL(sme_me_mask);
static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
/*
+ * SNP-specific routine which needs to additionally change the page state from
+ * private to shared before copying the data from the source to destination and
+ * restore after the copy.
+ */
+static inline void __init snp_memcpy(void *dst, void *src, size_t sz,
+ unsigned long paddr, bool decrypt)
+{
+ unsigned long npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
+
+ if (decrypt) {
+ /*
+ * @paddr needs to be accessed decrypted, mark the page shared in
+ * the RMP table before copying it.
+ */
+ early_snp_set_memory_shared((unsigned long)__va(paddr), paddr, npages);
+
+ memcpy(dst, src, sz);
+
+ /* Restore the page state after the memcpy. */
+ early_snp_set_memory_private((unsigned long)__va(paddr), paddr, npages);
+ } else {
+ /*
+ * @paddr need to be accessed encrypted, no need for the page state
+ * change.
+ */
+ memcpy(dst, src, sz);
+ }
+}
+
+/*
* This routine does not change the underlying encryption setting of the
* page(s) that map this memory. It assumes that eventually the memory is
* meant to be accessed as either encrypted or decrypted but the contents
@@ -95,8 +126,13 @@ static void __init __sme_early_enc_dec(resource_size_t paddr,
* Use a temporary buffer, of cache-line multiple size, to
* avoid data corruption as documented in the APM.
*/
- memcpy(sme_early_buffer, src, len);
- memcpy(dst, sme_early_buffer, len);
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ snp_memcpy(sme_early_buffer, src, len, paddr, enc);
+ snp_memcpy(dst, sme_early_buffer, len, paddr, !enc);
+ } else {
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+ }
early_memunmap(dst, len);
early_memunmap(src, len);
@@ -280,11 +316,24 @@ static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
{
+ /*
+ * To maintain the security guarantees of SEV-SNP guests, make sure
+ * to invalidate the memory before encryption attribute is cleared.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc)
+ snp_set_memory_shared(vaddr, npages);
}
/* Return true unconditionally: return value doesn't matter for the SEV side */
static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
{
+ /*
+ * After memory is mapped encrypted in the page table, validate it
+ * so that it is consistent with the page table updates.
+ */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && enc)
+ snp_set_memory_private(vaddr, npages);
+
if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
enc_dec_hypercall(vaddr, npages, enc);
@@ -322,14 +371,28 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
clflush_cache_range(__va(pa), size);
/* Encrypt/decrypt the contents in-place */
- if (enc)
+ if (enc) {
sme_early_encrypt(pa, size);
- else
+ } else {
sme_early_decrypt(pa, size);
+ /*
+ * ON SNP, the page state in the RMP table must happen
+ * before the page table updates.
+ */
+ early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1);
+ }
+
/* Change the page encryption mask. */
new_pte = pfn_pte(pfn, new_prot);
set_pte_atomic(kpte, new_pte);
+
+ /*
+ * If page is set encrypted in the page table, then update the RMP table to
+ * add this page as private.
+ */
+ if (enc)
+ early_snp_set_memory_private((unsigned long)__va(pa), pa, 1);
}
static int __init early_set_memory_enc_dec(unsigned long vaddr,
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index b43bc24d2bb6..f415498d3175 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -45,6 +45,7 @@
#include <asm/sections.h>
#include <asm/cmdline.h>
#include <asm/coco.h>
+#include <asm/sev.h>
#include "mm_internal.h"
@@ -509,8 +510,11 @@ void __init sme_enable(struct boot_params *bp)
bool active_by_default;
unsigned long me_mask;
char buffer[16];
+ bool snp;
u64 msr;
+ snp = snp_init(bp);
+
/* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
@@ -542,6 +546,10 @@ void __init sme_enable(struct boot_params *bp)
sev_status = __rdmsr(MSR_AMD64_SEV);
feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+ /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */
+ if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ snp_abort();
+
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
/*
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 933a2ebad471..c3317f0650d8 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -400,7 +400,7 @@ static void leave_uniprocessor(void)
int cpu;
int err;
- if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0)
+ if (!cpumask_available(downed_cpus) || cpumask_empty(downed_cpus))
return;
pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 1a02b791d273..9a9305367fdd 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -123,7 +123,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
*/
- while (nodes_weight(physnode_mask)) {
+ while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
@@ -270,7 +270,7 @@ static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
* Fill physical nodes with fake nodes of size until there is no memory
* left on any of them.
*/
- while (nodes_weight(physnode_mask)) {
+ while (!nodes_empty(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 4ba2a3ee4bce..d5ef64ddd35e 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -101,7 +101,7 @@ int pat_debug_enable;
static int __init pat_debug_setup(char *str)
{
pat_debug_enable = 1;
- return 0;
+ return 1;
}
__setup("debugpat", pat_debug_setup);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index abf5ed76e4b7..0656db33574d 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -638,17 +638,6 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
}
EXPORT_SYMBOL_GPL(lookup_address);
-/*
- * Lookup the page table entry for a virtual address in a given mm. Return a
- * pointer to the entry and the level of the mapping.
- */
-pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
- unsigned int *level)
-{
- return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
-}
-EXPORT_SYMBOL_GPL(lookup_address_in_mm);
-
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 5d5c7bb50ce9..ffe3b3a087fe 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -540,7 +540,7 @@ static inline bool pti_kernel_image_global_ok(void)
* cases where RANDSTRUCT is in use to help keep the layout a
* secret.
*/
- if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
+ if (IS_ENABLED(CONFIG_RANDSTRUCT))
return false;
return true;
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
deleted file mode 100644
index ed5667f5169f..000000000000
--- a/arch/x86/mm/setup_nx.c
+++ /dev/null
@@ -1,62 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/pgtable.h>
-
-#include <asm/proto.h>
-#include <asm/cpufeature.h>
-
-static int disable_nx;
-
-/*
- * noexec = on|off
- *
- * Control non-executable mappings for processes.
- *
- * on Enable
- * off Disable
- */
-static int __init noexec_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- disable_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- disable_nx = 1;
- }
- x86_configure_nx();
- return 0;
-}
-early_param("noexec", noexec_setup);
-
-void x86_configure_nx(void)
-{
- if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
- __supported_pte_mask |= _PAGE_NX;
- else
- __supported_pte_mask &= ~_PAGE_NX;
-}
-
-void __init x86_report_nx(void)
-{
- if (!boot_cpu_has(X86_FEATURE_NX)) {
- printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
- "missing in CPU!\n");
- } else {
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
- if (disable_nx) {
- printk(KERN_INFO "NX (Execute Disable) protection: "
- "disabled by kernel command line option\n");
- } else {
- printk(KERN_INFO "NX (Execute Disable) protection: "
- "active\n");
- }
-#else
- /* 32bit non-PAE kernel, NX cannot be used */
- printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
- "cannot be enabled: non-PAE kernel!\n");
-#endif
- }
-}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb4d91d5365..d400b6d9d246 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -855,13 +855,11 @@ done:
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu)
+static bool tlb_is_not_lazy(int cpu, void *data)
{
return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
}
-static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
-
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
@@ -890,36 +888,11 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables) {
+ if (info->freed_tables)
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
- } else {
- /*
- * Although we could have used on_each_cpu_cond_mask(),
- * open-coding it has performance advantages, as it eliminates
- * the need for indirect calls or retpolines. In addition, it
- * allows to use a designated cpumask for evaluating the
- * condition, instead of allocating one.
- *
- * This code works under the assumption that there are no nested
- * TLB flushes, an assumption that is already made in
- * flush_tlb_mm_range().
- *
- * cond_cpumask is logically a stack-local variable, but it is
- * more efficient to have it off the stack and not to allocate
- * it on demand. Preemption is disabled and this code is
- * non-reentrant.
- */
- struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
- int cpu;
-
- cpumask_clear(cond_cpumask);
-
- for_each_cpu(cpu, cpumask) {
- if (tlb_is_not_lazy(cpu))
- __cpumask_set_cpu(cpu, cond_cpumask);
- }
- on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
- }
+ else
+ on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ (void *)info, 1, cpumask);
}
void flush_tlb_multi(const struct cpumask *cpumask,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8fe35ed11fd6..16b6efacf7c6 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -412,6 +412,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
EMIT_LFENCE();
EMIT2(0xFF, 0xE0 + reg);
} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
+ OPTIMIZER_HIDE_VAR(reg);
emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
} else
#endif
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 97b63e35e152..a498b847d740 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -25,6 +25,8 @@
#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
#define PIRQ_VERSION 0x0100
+#define IRT_SIGNATURE (('$' << 0) + ('I' << 8) + ('R' << 16) + ('T' << 24))
+
static int broken_hp_bios_irq9;
static int acer_tm360_irqrouting;
@@ -68,30 +70,99 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq;
* and perform checksum verification.
*/
-static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
+static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr,
+ u8 *limit)
{
struct irq_routing_table *rt;
int i;
u8 sum;
- rt = (struct irq_routing_table *) addr;
+ rt = (struct irq_routing_table *)addr;
if (rt->signature != PIRQ_SIGNATURE ||
rt->version != PIRQ_VERSION ||
rt->size % 16 ||
- rt->size < sizeof(struct irq_routing_table))
+ rt->size < sizeof(struct irq_routing_table) ||
+ (limit && rt->size > limit - addr))
return NULL;
sum = 0;
for (i = 0; i < rt->size; i++)
sum += addr[i];
if (!sum) {
- DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n",
- rt);
+ DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%lx\n",
+ __pa(rt));
return rt;
}
return NULL;
}
+/*
+ * Handle the $IRT PCI IRQ Routing Table format used by AMI for its BCP
+ * (BIOS Configuration Program) external tool meant for tweaking BIOS
+ * structures without the need to rebuild it from sources. The $IRT
+ * format has been invented by AMI before Microsoft has come up with its
+ * $PIR format and a $IRT table is therefore there in some systems that
+ * lack a $PIR table.
+ *
+ * It uses the same PCI BIOS 2.1 format for interrupt routing entries
+ * themselves but has a different simpler header prepended instead,
+ * occupying 8 bytes, where a `$IRT' signature is followed by one byte
+ * specifying the total number of interrupt routing entries allocated in
+ * the table, then one byte specifying the actual number of entries used
+ * (which the BCP tool can take advantage of when modifying the table),
+ * and finally a 16-bit word giving the IRQs devoted exclusively to PCI.
+ * Unlike with the $PIR table there is no alignment guarantee.
+ *
+ * Given the similarity of the two formats the $IRT one is trivial to
+ * convert to the $PIR one, which we do here, except that obviously we
+ * have no information as to the router device to use, but we can handle
+ * it by matching PCI device IDs actually seen on the bus against ones
+ * that our individual routers recognise.
+ *
+ * Reportedly there is another $IRT table format where a 16-bit word
+ * follows the header instead that points to interrupt routing entries
+ * in a $PIR table provided elsewhere. In that case this code will not
+ * be reached though as the $PIR table will have been chosen instead.
+ */
+static inline struct irq_routing_table *pirq_convert_irt_table(u8 *addr,
+ u8 *limit)
+{
+ struct irt_routing_table *ir;
+ struct irq_routing_table *rt;
+ u16 size;
+ u8 sum;
+ int i;
+
+ ir = (struct irt_routing_table *)addr;
+ if (ir->signature != IRT_SIGNATURE || !ir->used || ir->size < ir->used)
+ return NULL;
+
+ size = sizeof(*ir) + ir->used * sizeof(ir->slots[0]);
+ if (size > limit - addr)
+ return NULL;
+
+ DBG(KERN_DEBUG "PCI: $IRT Interrupt Routing Table found at 0x%lx\n",
+ __pa(ir));
+ size = sizeof(*rt) + ir->used * sizeof(rt->slots[0]);
+ rt = kzalloc(size, GFP_KERNEL);
+ if (!rt)
+ return NULL;
+
+ rt->signature = PIRQ_SIGNATURE;
+ rt->version = PIRQ_VERSION;
+ rt->size = size;
+ rt->exclusive_irqs = ir->exclusive_irqs;
+ for (i = 0; i < ir->used; i++)
+ rt->slots[i] = ir->slots[i];
+
+ addr = (u8 *)rt;
+ sum = 0;
+ for (i = 0; i < size; i++)
+ sum += addr[i];
+ rt->checksum = -sum;
+
+ return rt;
+}
/*
* Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
@@ -99,17 +170,29 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr)
static struct irq_routing_table * __init pirq_find_routing_table(void)
{
+ u8 * const bios_start = (u8 *)__va(0xf0000);
+ u8 * const bios_end = (u8 *)__va(0x100000);
u8 *addr;
struct irq_routing_table *rt;
if (pirq_table_addr) {
- rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr));
+ rt = pirq_check_routing_table((u8 *)__va(pirq_table_addr),
+ NULL);
if (rt)
return rt;
printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
}
- for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
- rt = pirq_check_routing_table(addr);
+ for (addr = bios_start;
+ addr < bios_end - sizeof(struct irq_routing_table);
+ addr += 16) {
+ rt = pirq_check_routing_table(addr, bios_end);
+ if (rt)
+ return rt;
+ }
+ for (addr = bios_start;
+ addr < bios_end - sizeof(struct irt_routing_table);
+ addr++) {
+ rt = pirq_convert_irt_table(addr, bios_end);
if (rt)
return rt;
}
@@ -135,7 +218,8 @@ static void __init pirq_peer_trick(void)
#ifdef DEBUG
{
int j;
- DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
+ DBG(KERN_DEBUG "%02x:%02x.%x slot=%02x",
+ e->bus, e->devfn / 8, e->devfn % 8, e->slot);
for (j = 0; j < 4; j++)
DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
DBG("\n");
@@ -253,6 +337,15 @@ static void write_pc_conf_nybble(u8 base, u8 index, u8 val)
pc_conf_set(reg, x);
}
+/*
+ * FinALi pirq rules are as follows:
+ *
+ * - bit 0 selects between INTx Routing Table Mapping Registers,
+ *
+ * - bit 3 selects the nibble within the INTx Routing Table Mapping Register,
+ *
+ * - bits 7:4 map to bits 3:0 of the PCI INTx Sensitivity Register.
+ */
static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev,
int pirq)
{
@@ -260,11 +353,13 @@ static int pirq_finali_get(struct pci_dev *router, struct pci_dev *dev,
0, 9, 3, 10, 4, 5, 7, 6, 0, 11, 0, 12, 0, 14, 0, 15
};
unsigned long flags;
+ u8 index;
u8 x;
+ index = (pirq & 1) << 1 | (pirq & 8) >> 3;
raw_spin_lock_irqsave(&pc_conf_lock, flags);
pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY);
- x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1)];
+ x = irqmap[read_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, index)];
pc_conf_set(PC_CONF_FINALI_LOCK, 0);
raw_spin_unlock_irqrestore(&pc_conf_lock, flags);
return x;
@@ -278,13 +373,15 @@ static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev,
};
u8 val = irqmap[irq];
unsigned long flags;
+ u8 index;
if (!val)
return 0;
+ index = (pirq & 1) << 1 | (pirq & 8) >> 3;
raw_spin_lock_irqsave(&pc_conf_lock, flags);
pc_conf_set(PC_CONF_FINALI_LOCK, PC_CONF_FINALI_LOCK_KEY);
- write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, pirq - 1, val);
+ write_pc_conf_nybble(PC_CONF_FINALI_PCI_INTX_RT1, index, val);
pc_conf_set(PC_CONF_FINALI_LOCK, 0);
raw_spin_unlock_irqrestore(&pc_conf_lock, flags);
return 1;
@@ -293,7 +390,7 @@ static int pirq_finali_set(struct pci_dev *router, struct pci_dev *dev,
static int pirq_finali_lvl(struct pci_dev *router, struct pci_dev *dev,
int pirq, int irq)
{
- u8 mask = ~(1u << (pirq - 1));
+ u8 mask = ~((pirq & 0xf0u) >> 4);
unsigned long flags;
u8 trig;
@@ -579,6 +676,81 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
return 1;
}
+
+/*
+ * PIRQ routing for the SiS85C497 AT Bus Controller & Megacell (ATM)
+ * ISA bridge used with the SiS 85C496/497 486 Green PC VESA/ISA/PCI
+ * Chipset.
+ *
+ * There are four PCI INTx#-to-IRQ Link registers provided in the
+ * SiS85C497 part of the peculiar combined 85C496/497 configuration
+ * space decoded by the SiS85C496 PCI & CPU Memory Controller (PCM)
+ * host bridge, at 0xc0/0xc1/0xc2/0xc3 respectively for the PCI INT
+ * A/B/C/D lines. Bit 7 enables the respective link if set and bits
+ * 3:0 select the 8259A IRQ line as follows:
+ *
+ * 0000 : Reserved
+ * 0001 : Reserved
+ * 0010 : Reserved
+ * 0011 : IRQ3
+ * 0100 : IRQ4
+ * 0101 : IRQ5
+ * 0110 : IRQ6
+ * 0111 : IRQ7
+ * 1000 : Reserved
+ * 1001 : IRQ9
+ * 1010 : IRQ10
+ * 1011 : IRQ11
+ * 1100 : IRQ12
+ * 1101 : Reserved
+ * 1110 : IRQ14
+ * 1111 : IRQ15
+ *
+ * We avoid using a reserved value for disabled links, hence the
+ * choice of IRQ15 for that case.
+ *
+ * References:
+ *
+ * "486 Green PC VESA/ISA/PCI Chipset, SiS 85C496/497", Rev 3.0,
+ * Silicon Integrated Systems Corp., July 1995
+ */
+
+#define PCI_SIS497_INTA_TO_IRQ_LINK 0xc0u
+
+#define PIRQ_SIS497_IRQ_MASK 0x0fu
+#define PIRQ_SIS497_IRQ_ENABLE 0x80u
+
+static int pirq_sis497_get(struct pci_dev *router, struct pci_dev *dev,
+ int pirq)
+{
+ int reg;
+ u8 x;
+
+ reg = pirq;
+ if (reg >= 1 && reg <= 4)
+ reg += PCI_SIS497_INTA_TO_IRQ_LINK - 1;
+
+ pci_read_config_byte(router, reg, &x);
+ return (x & PIRQ_SIS497_IRQ_ENABLE) ? (x & PIRQ_SIS497_IRQ_MASK) : 0;
+}
+
+static int pirq_sis497_set(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
+{
+ int reg;
+ u8 x;
+
+ reg = pirq;
+ if (reg >= 1 && reg <= 4)
+ reg += PCI_SIS497_INTA_TO_IRQ_LINK - 1;
+
+ pci_read_config_byte(router, reg, &x);
+ x &= ~(PIRQ_SIS497_IRQ_MASK | PIRQ_SIS497_IRQ_ENABLE);
+ x |= irq ? (PIRQ_SIS497_IRQ_ENABLE | irq) : PIRQ_SIS497_IRQ_MASK;
+ pci_write_config_byte(router, reg, x);
+ return 1;
+}
+
/*
* PIRQ routing for SiS 85C503 router used in several SiS chipsets.
* We have to deal with the following issues here:
@@ -640,11 +812,12 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
* bit 6-4 are probably unused, not like 5595
*/
-#define PIRQ_SIS_IRQ_MASK 0x0f
-#define PIRQ_SIS_IRQ_DISABLE 0x80
-#define PIRQ_SIS_USB_ENABLE 0x40
+#define PIRQ_SIS503_IRQ_MASK 0x0f
+#define PIRQ_SIS503_IRQ_DISABLE 0x80
+#define PIRQ_SIS503_USB_ENABLE 0x40
-static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+static int pirq_sis503_get(struct pci_dev *router, struct pci_dev *dev,
+ int pirq)
{
u8 x;
int reg;
@@ -653,10 +826,11 @@ static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
if (reg >= 0x01 && reg <= 0x04)
reg += 0x40;
pci_read_config_byte(router, reg, &x);
- return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK);
+ return (x & PIRQ_SIS503_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS503_IRQ_MASK);
}
-static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+static int pirq_sis503_set(struct pci_dev *router, struct pci_dev *dev,
+ int pirq, int irq)
{
u8 x;
int reg;
@@ -665,8 +839,8 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
if (reg >= 0x01 && reg <= 0x04)
reg += 0x40;
pci_read_config_byte(router, reg, &x);
- x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
- x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE;
+ x &= ~(PIRQ_SIS503_IRQ_MASK | PIRQ_SIS503_IRQ_DISABLE);
+ x |= irq ? irq : PIRQ_SIS503_IRQ_DISABLE;
pci_write_config_byte(router, reg, x);
return 1;
}
@@ -958,13 +1132,19 @@ static __init int serverworks_router_probe(struct irq_router *r,
static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
{
- if (device != PCI_DEVICE_ID_SI_503)
- return 0;
-
- r->name = "SIS";
- r->get = pirq_sis_get;
- r->set = pirq_sis_set;
- return 1;
+ switch (device) {
+ case PCI_DEVICE_ID_SI_496:
+ r->name = "SiS85C497";
+ r->get = pirq_sis497_get;
+ r->set = pirq_sis497_set;
+ return 1;
+ case PCI_DEVICE_ID_SI_503:
+ r->name = "SiS85C503";
+ r->get = pirq_sis503_get;
+ r->set = pirq_sis503_set;
+ return 1;
+ }
+ return 0;
}
static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
@@ -1084,10 +1264,32 @@ static struct pci_dev *pirq_router_dev;
* chipset" ?
*/
+static bool __init pirq_try_router(struct irq_router *r,
+ struct irq_routing_table *rt,
+ struct pci_dev *dev)
+{
+ struct irq_router_handler *h;
+
+ DBG(KERN_DEBUG "PCI: Trying IRQ router for [%04x:%04x]\n",
+ dev->vendor, dev->device);
+
+ for (h = pirq_routers; h->vendor; h++) {
+ /* First look for a router match */
+ if (rt->rtr_vendor == h->vendor &&
+ h->probe(r, dev, rt->rtr_device))
+ return true;
+ /* Fall back to a device match */
+ if (dev->vendor == h->vendor &&
+ h->probe(r, dev, dev->device))
+ return true;
+ }
+ return false;
+}
+
static void __init pirq_find_router(struct irq_router *r)
{
struct irq_routing_table *rt = pirq_table;
- struct irq_router_handler *h;
+ struct pci_dev *dev;
#ifdef CONFIG_PCI_BIOS
if (!rt->signature) {
@@ -1106,50 +1308,94 @@ static void __init pirq_find_router(struct irq_router *r)
DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
rt->rtr_vendor, rt->rtr_device);
- pirq_router_dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus,
- rt->rtr_devfn);
- if (!pirq_router_dev) {
- DBG(KERN_DEBUG "PCI: Interrupt router not found at "
- "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
- return;
+ /* Use any vendor:device provided by the routing table or try all. */
+ if (rt->rtr_vendor) {
+ dev = pci_get_domain_bus_and_slot(0, rt->rtr_bus,
+ rt->rtr_devfn);
+ if (dev && pirq_try_router(r, rt, dev))
+ pirq_router_dev = dev;
+ } else {
+ dev = NULL;
+ for_each_pci_dev(dev) {
+ if (pirq_try_router(r, rt, dev)) {
+ pirq_router_dev = dev;
+ break;
+ }
+ }
}
- for (h = pirq_routers; h->vendor; h++) {
- /* First look for a router match */
- if (rt->rtr_vendor == h->vendor &&
- h->probe(r, pirq_router_dev, rt->rtr_device))
- break;
- /* Fall back to a device match */
- if (pirq_router_dev->vendor == h->vendor &&
- h->probe(r, pirq_router_dev, pirq_router_dev->device))
- break;
- }
- dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
- pirq_router.name,
- pirq_router_dev->vendor, pirq_router_dev->device);
+ if (pirq_router_dev)
+ dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
+ pirq_router.name,
+ pirq_router_dev->vendor, pirq_router_dev->device);
+ else
+ DBG(KERN_DEBUG "PCI: Interrupt router not found at "
+ "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
/* The device remains referenced for the kernel lifetime */
}
-static struct irq_info *pirq_get_info(struct pci_dev *dev)
+/*
+ * We're supposed to match on the PCI device only and not the function,
+ * but some BIOSes build their tables with the PCI function included
+ * for motherboard devices, so if a complete match is found, then give
+ * it precedence over a slot match.
+ */
+static struct irq_info *pirq_get_dev_info(struct pci_dev *dev)
{
struct irq_routing_table *rt = pirq_table;
int entries = (rt->size - sizeof(struct irq_routing_table)) /
sizeof(struct irq_info);
+ struct irq_info *slotinfo = NULL;
struct irq_info *info;
for (info = rt->slots; entries--; info++)
- if (info->bus == dev->bus->number &&
- PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
- return info;
- return NULL;
+ if (info->bus == dev->bus->number) {
+ if (info->devfn == dev->devfn)
+ return info;
+ if (!slotinfo &&
+ PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn))
+ slotinfo = info;
+ }
+ return slotinfo;
+}
+
+/*
+ * Buses behind bridges are typically not listed in the PIRQ routing table.
+ * Do the usual dance then and walk the tree of bridges up adjusting the
+ * pin number accordingly on the way until the originating root bus device
+ * has been reached and then use its routing information.
+ */
+static struct irq_info *pirq_get_info(struct pci_dev *dev, u8 *pin)
+{
+ struct pci_dev *temp_dev = dev;
+ struct irq_info *info;
+ u8 temp_pin = *pin;
+ u8 dpin = temp_pin;
+
+ info = pirq_get_dev_info(dev);
+ while (!info && temp_dev->bus->parent) {
+ struct pci_dev *bridge = temp_dev->bus->self;
+
+ temp_pin = pci_swizzle_interrupt_pin(temp_dev, temp_pin);
+ info = pirq_get_dev_info(bridge);
+ if (info)
+ dev_warn(&dev->dev,
+ "using bridge %s INT %c to get INT %c\n",
+ pci_name(bridge),
+ 'A' + temp_pin - 1, 'A' + dpin - 1);
+
+ temp_dev = bridge;
+ }
+ *pin = temp_pin;
+ return info;
}
static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
{
- u8 pin;
struct irq_info *info;
int i, pirq, newirq;
+ u8 dpin, pin;
int irq = 0;
u32 mask;
struct irq_router *r = &pirq_router;
@@ -1157,8 +1403,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
char *msg = NULL;
/* Find IRQ pin */
- pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
- if (!pin) {
+ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &dpin);
+ if (!dpin) {
dev_dbg(&dev->dev, "no interrupt pin\n");
return 0;
}
@@ -1171,20 +1417,21 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
if (!pirq_table)
return 0;
- info = pirq_get_info(dev);
+ pin = dpin;
+ info = pirq_get_info(dev, &pin);
if (!info) {
dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
- 'A' + pin - 1);
+ 'A' + dpin - 1);
return 0;
}
pirq = info->irq[pin - 1].link;
mask = info->irq[pin - 1].bitmap;
if (!pirq) {
- dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin - 1);
+ dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + dpin - 1);
return 0;
}
dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
- 'A' + pin - 1, pirq, mask, pirq_table->exclusive_irqs);
+ 'A' + dpin - 1, pirq, mask, pirq_table->exclusive_irqs);
mask &= pcibios_irq_mask;
/* Work around broken HP Pavilion Notebooks which assign USB to
@@ -1226,7 +1473,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
newirq = i;
}
}
- dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin - 1, newirq);
+ dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + dpin - 1, newirq);
/* Check if it is hardcoded */
if ((pirq & 0xf0) == 0xf0) {
@@ -1260,15 +1507,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
return 0;
}
}
- dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq);
+ dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n",
+ msg, 'A' + dpin - 1, irq);
/* Update IRQ for all devices with the same pirq value */
for_each_pci_dev(dev2) {
- pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
- if (!pin)
+ pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &dpin);
+ if (!dpin)
continue;
- info = pirq_get_info(dev2);
+ pin = dpin;
+ info = pirq_get_info(dev2, &pin);
if (!info)
continue;
if (info->irq[pin - 1].link == pirq) {
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 9bb1e2941179..b94f727251b6 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -467,7 +467,6 @@ static __init void xen_setup_pci_msi(void)
else
xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs;
xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs;
- pci_msi_ignore_mask = 1;
} else if (xen_hvm_domain()) {
xen_msi_ops.setup_msi_irqs = xen_hvm_setup_msi_irqs;
xen_msi_ops.teardown_msi_irqs = xen_teardown_msi_irqs;
@@ -481,6 +480,11 @@ static __init void xen_setup_pci_msi(void)
* in allocating the native domain and never use it.
*/
x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain;
+ /*
+ * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely
+ * controlled by the hypervisor.
+ */
+ pci_msi_ignore_mask = 1;
}
#else /* CONFIG_PCI_MSI */
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 147c30a81f15..1591d67e0bcd 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -93,6 +93,9 @@ static const unsigned long * const efi_tables[] = {
#ifdef CONFIG_LOAD_UEFI_KEYS
&efi.mokvar_table,
#endif
+#ifdef CONFIG_EFI_COCO_SECRET
+ &efi.coco_secret,
+#endif
};
u64 efi_setup; /* efi setup_data physical address */
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 72c1e42d121d..7fe564eaf228 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -50,6 +50,7 @@
#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
SYM_CODE_START_LOCAL(pvh_start_xen)
+ UNWIND_HINT_EMPTY
cld
lgdt (_pa(gdt))
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 1e9ff28bc2e0..a60af0230e27 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -244,8 +244,10 @@ static inline bool uv_nmi_action_is(const char *action)
/* Setup which NMI support is present in system */
static void uv_nmi_setup_mmrs(void)
{
+ bool new_nmi_method_only = false;
+
/* First determine arch specific MMRs to handshake with BIOS */
- if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) {
+ if (UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK) { /* UV2,3,4 setup */
uvh_nmi_mmrx = UVH_EVENT_OCCURRED0;
uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED0_ALIAS;
uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT;
@@ -255,26 +257,25 @@ static void uv_nmi_setup_mmrs(void)
uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2;
uvh_nmi_mmrx_req_shift = 62;
- } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) {
+ } else if (UVH_EVENT_OCCURRED1_EXTIO_INT0_MASK) { /* UV5+ setup */
uvh_nmi_mmrx = UVH_EVENT_OCCURRED1;
uvh_nmi_mmrx_clear = UVH_EVENT_OCCURRED1_ALIAS;
uvh_nmi_mmrx_shift = UVH_EVENT_OCCURRED1_EXTIO_INT0_SHFT;
uvh_nmi_mmrx_type = "OCRD1-EXTIO_INT0";
- uvh_nmi_mmrx_supported = UVH_EXTIO_INT0_BROADCAST;
- uvh_nmi_mmrx_req = UVH_BIOS_KERNEL_MMR_ALIAS_2;
- uvh_nmi_mmrx_req_shift = 62;
+ new_nmi_method_only = true; /* Newer nmi always valid on UV5+ */
+ uvh_nmi_mmrx_req = 0; /* no request bit to clear */
} else {
- pr_err("UV:%s:cannot find EVENT_OCCURRED*_EXTIO_INT0\n",
- __func__);
+ pr_err("UV:%s:NMI support not available on this system\n", __func__);
return;
}
/* Then find out if new NMI is supported */
- if (likely(uv_read_local_mmr(uvh_nmi_mmrx_supported))) {
- uv_write_local_mmr(uvh_nmi_mmrx_req,
- 1UL << uvh_nmi_mmrx_req_shift);
+ if (new_nmi_method_only || uv_read_local_mmr(uvh_nmi_mmrx_supported)) {
+ if (uvh_nmi_mmrx_req)
+ uv_write_local_mmr(uvh_nmi_mmrx_req,
+ 1UL << uvh_nmi_mmrx_req_shift);
nmi_mmr = uvh_nmi_mmrx;
nmi_mmr_clear = uvh_nmi_mmrx_clear;
nmi_mmr_pending = 1UL << uvh_nmi_mmrx_shift;
@@ -985,7 +986,7 @@ static int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
/* Clear global flags */
if (master) {
- if (cpumask_weight(uv_nmi_cpu_mask))
+ if (!cpumask_empty(uv_nmi_cpu_mask))
uv_nmi_cleanup_mask();
atomic_set(&uv_nmi_cpus_in_nmi, -1);
atomic_set(&uv_nmi_cpu, -1);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 9f2b251e83c5..bb176c72891c 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -25,6 +25,7 @@
#include <asm/cpu.h>
#include <asm/mmu_context.h>
#include <asm/cpu_device_id.h>
+#include <asm/microcode.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -40,7 +41,8 @@ static void msr_save_context(struct saved_context *ctxt)
struct saved_msr *end = msr + ctxt->saved_msrs.num;
while (msr < end) {
- msr->valid = !rdmsrl_safe(msr->info.msr_no, &msr->info.reg.q);
+ if (msr->valid)
+ rdmsrl(msr->info.msr_no, msr->info.reg.q);
msr++;
}
}
@@ -261,11 +263,18 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
perf_restore_debug_store();
- msr_restore_context(ctxt);
c = &cpu_data(smp_processor_id());
if (cpu_has(c, X86_FEATURE_MSR_IA32_FEAT_CTL))
init_ia32_feat_ctl(c);
+
+ microcode_bsp_resume();
+
+ /*
+ * This needs to happen after the microcode has been updated upon resume
+ * because some of the MSRs are "emulated" in microcode.
+ */
+ msr_restore_context(ctxt);
}
/* Needed by apm.c */
@@ -424,8 +433,10 @@ static int msr_build_context(const u32 *msr_id, const int num)
}
for (i = saved_msrs->num, j = 0; i < total_num; i++, j++) {
+ u64 dummy;
+
msr_array[i].info.msr_no = msr_id[j];
- msr_array[i].valid = false;
+ msr_array[i].valid = !rdmsrl_safe(msr_id[j], &dummy);
msr_array[i].info.reg.q = 0;
}
saved_msrs->num = total_num;
@@ -500,10 +511,24 @@ static int pm_cpu_check(const struct x86_cpu_id *c)
return ret;
}
+static void pm_save_spec_msr(void)
+{
+ u32 spec_msr_id[] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_TSX_CTRL,
+ MSR_TSX_FORCE_ABORT,
+ MSR_IA32_MCU_OPT_CTRL,
+ MSR_AMD64_LS_CFG,
+ };
+
+ msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id));
+}
+
static int pm_check_save_msr(void)
{
dmi_check_system(msr_save_dmi_table);
pm_cpu_check(msr_save_cpu_table);
+ pm_save_spec_msr();
return 0;
}
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index c5e29db02a46..41d7669a97ad 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -67,7 +67,7 @@ void __init reserve_real_mode(void)
memblock_reserve(0, SZ_1M);
}
-static void sme_sev_setup_real_mode(struct trampoline_header *th)
+static void __init sme_sev_setup_real_mode(struct trampoline_header *th)
{
#ifdef CONFIG_AMD_MEM_ENCRYPT
if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index 8c1db5bf5d78..2eb62be6d256 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -24,6 +24,7 @@ SYM_DATA_START(real_mode_header)
.long pa_sev_es_trampoline_start
#endif
#ifdef CONFIG_X86_64
+ .long pa_trampoline_start64
.long pa_trampoline_pgd;
#endif
/* ACPI S3 wakeup */
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index cc8391f86cdb..e38d61d6562e 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -70,7 +70,7 @@ SYM_CODE_START(trampoline_start)
movw $__KERNEL_DS, %dx # Data segment descriptor
# Enable protected mode
- movl $X86_CR0_PE, %eax # protected mode (PE) bit
+ movl $(CR0_STATE & ~X86_CR0_PG), %eax
movl %eax, %cr0 # into protected mode
# flush prefetch and jump to startup_32
@@ -143,13 +143,24 @@ SYM_CODE_START(startup_32)
movl %eax, %cr3
# Set up EFER
+ movl $MSR_EFER, %ecx
+ rdmsr
+ /*
+ * Skip writing to EFER if the register already has desired
+ * value (to avoid #VE for the TDX guest).
+ */
+ cmp pa_tr_efer, %eax
+ jne .Lwrite_efer
+ cmp pa_tr_efer + 4, %edx
+ je .Ldone_efer
+.Lwrite_efer:
movl pa_tr_efer, %eax
movl pa_tr_efer + 4, %edx
- movl $MSR_EFER, %ecx
wrmsr
- # Enable paging and in turn activate Long Mode
- movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax
+.Ldone_efer:
+ # Enable paging and in turn activate Long Mode.
+ movl $CR0_STATE, %eax
movl %eax, %cr0
/*
@@ -161,6 +172,19 @@ SYM_CODE_START(startup_32)
ljmpl $__KERNEL_CS, $pa_startup_64
SYM_CODE_END(startup_32)
+SYM_CODE_START(pa_trampoline_compat)
+ /*
+ * In compatibility mode. Prep ESP and DX for startup_32, then disable
+ * paging and complete the switch to legacy 32-bit mode.
+ */
+ movl $rm_stack_end, %esp
+ movw $__KERNEL_DS, %dx
+
+ movl $(CR0_STATE & ~X86_CR0_PG), %eax
+ movl %eax, %cr0
+ ljmpl $__KERNEL32_CS, $pa_startup_32
+SYM_CODE_END(pa_trampoline_compat)
+
.section ".text64","ax"
.code64
.balign 4
@@ -169,6 +193,20 @@ SYM_CODE_START(startup_64)
jmpq *tr_start(%rip)
SYM_CODE_END(startup_64)
+SYM_CODE_START(trampoline_start64)
+ /*
+ * APs start here on a direct transfer from 64-bit BIOS with identity
+ * mapped page tables. Load the kernel's GDT in order to gear down to
+ * 32-bit mode (to handle 4-level vs. 5-level paging), and to (re)load
+ * segment registers. Load the zero IDT so any fault triggers a
+ * shutdown instead of jumping back into BIOS.
+ */
+ lidt tr_idt(%rip)
+ lgdt tr_gdt64(%rip)
+
+ ljmpl *tr_compat(%rip)
+SYM_CODE_END(trampoline_start64)
+
.section ".rodata","a"
# Duplicate the global descriptor table
# so the kernel can live anywhere
@@ -182,6 +220,17 @@ SYM_DATA_START(tr_gdt)
.quad 0x00cf93000000ffff # __KERNEL_DS
SYM_DATA_END_LABEL(tr_gdt, SYM_L_LOCAL, tr_gdt_end)
+SYM_DATA_START(tr_gdt64)
+ .short tr_gdt_end - tr_gdt - 1 # gdt limit
+ .long pa_tr_gdt
+ .long 0
+SYM_DATA_END(tr_gdt64)
+
+SYM_DATA_START(tr_compat)
+ .long pa_trampoline_compat
+ .short __KERNEL32_CS
+SYM_DATA_END(tr_compat)
+
.bss
.balign PAGE_SIZE
SYM_DATA(trampoline_pgd, .space PAGE_SIZE)
diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S
index 5033e640f957..4331c32c47f8 100644
--- a/arch/x86/realmode/rm/trampoline_common.S
+++ b/arch/x86/realmode/rm/trampoline_common.S
@@ -1,4 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
.section ".rodata","a"
.balign 16
-SYM_DATA_LOCAL(tr_idt, .fill 1, 6, 0)
+
+/*
+ * When a bootloader hands off to the kernel in 32-bit mode an
+ * IDT with a 2-byte limit and 4-byte base is needed. When a boot
+ * loader hands off to a kernel 64-bit mode the base address
+ * extends to 8-bytes. Reserve enough space for either scenario.
+ */
+SYM_DATA_START_LOCAL(tr_idt)
+ .short 0
+ .quad 0
+SYM_DATA_END(tr_idt)
diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c
index 1d6437e6d2ba..a6f4d8388ad8 100644
--- a/arch/x86/realmode/rm/wakemain.c
+++ b/arch/x86/realmode/rm/wakemain.c
@@ -62,8 +62,12 @@ static void send_morse(const char *pattern)
}
}
+struct port_io_ops pio_ops;
+
void main(void)
{
+ init_default_io_ops();
+
/* Kill machine if structures are wrong */
if (wakeup_header.real_magic != 0x12345678)
while (1)
diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S
new file mode 100644
index 000000000000..49a54356ae99
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/tdxcall.S
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/asm-offsets.h>
+#include <asm/tdx.h>
+
+/*
+ * TDCALL and SEAMCALL are supported in Binutils >= 2.36.
+ */
+#define tdcall .byte 0x66,0x0f,0x01,0xcc
+#define seamcall .byte 0x66,0x0f,0x01,0xcf
+
+/*
+ * TDX_MODULE_CALL - common helper macro for both
+ * TDCALL and SEAMCALL instructions.
+ *
+ * TDCALL - used by TDX guests to make requests to the
+ * TDX module and hypercalls to the VMM.
+ * SEAMCALL - used by TDX hosts to make requests to the
+ * TDX module.
+ */
+.macro TDX_MODULE_CALL host:req
+ /*
+ * R12 will be used as temporary storage for struct tdx_module_output
+ * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL
+ * services supported by this function, it can be reused.
+ */
+
+ /* Callee saved, so preserve it */
+ push %r12
+
+ /*
+ * Push output pointer to stack.
+ * After the operation, it will be fetched into R12 register.
+ */
+ push %r9
+
+ /* Mangle function call ABI into TDCALL/SEAMCALL ABI: */
+ /* Move Leaf ID to RAX */
+ mov %rdi, %rax
+ /* Move input 4 to R9 */
+ mov %r8, %r9
+ /* Move input 3 to R8 */
+ mov %rcx, %r8
+ /* Move input 1 to RCX */
+ mov %rsi, %rcx
+ /* Leave input param 2 in RDX */
+
+ .if \host
+ seamcall
+ /*
+ * SEAMCALL instruction is essentially a VMExit from VMX root
+ * mode to SEAM VMX root mode. VMfailInvalid (CF=1) indicates
+ * that the targeted SEAM firmware is not loaded or disabled,
+ * or P-SEAMLDR is busy with another SEAMCALL. %rax is not
+ * changed in this case.
+ *
+ * Set %rax to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid.
+ * This value will never be used as actual SEAMCALL error code as
+ * it is from the Reserved status code class.
+ */
+ jnc .Lno_vmfailinvalid
+ mov $TDX_SEAMCALL_VMFAILINVALID, %rax
+.Lno_vmfailinvalid:
+
+ .else
+ tdcall
+ .endif
+
+ /*
+ * Fetch output pointer from stack to R12 (It is used
+ * as temporary storage)
+ */
+ pop %r12
+
+ /*
+ * Since this macro can be invoked with NULL as an output pointer,
+ * check if caller provided an output struct before storing output
+ * registers.
+ *
+ * Update output registers, even if the call failed (RAX != 0).
+ * Other registers may contain details of the failure.
+ */
+ test %r12, %r12
+ jz .Lno_output_struct
+
+ /* Copy result registers to output struct: */
+ movq %rcx, TDX_MODULE_rcx(%r12)
+ movq %rdx, TDX_MODULE_rdx(%r12)
+ movq %r8, TDX_MODULE_r8(%r12)
+ movq %r9, TDX_MODULE_r9(%r12)
+ movq %r10, TDX_MODULE_r10(%r12)
+ movq %r11, TDX_MODULE_r11(%r12)
+
+.Lno_output_struct:
+ /* Restore the state of R12 register */
+ pop %r12
+.endm
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5038edb79ad5..ca85d1409917 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -30,7 +30,6 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/edd.h>
-#include <linux/objtool.h>
#include <xen/xen.h>
#include <xen/events.h>
@@ -165,7 +164,6 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
*bx &= maskebx;
}
-STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
static bool __init xen_check_mwait(void)
{
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 688aa8b6ae29..ba7af2eca755 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -260,8 +260,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
- if (ctxt == NULL)
+ if (ctxt == NULL) {
+ cpumask_clear_cpu(cpu, xen_cpu_initialized_map);
+ cpumask_clear_cpu(cpu, cpu_callout_mask);
return -ENOMEM;
+ }
gdt = get_cpu_gdt_rw(cpu);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ac17196e2518..3a2cd93bf059 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -45,6 +45,7 @@ SYM_CODE_END(hypercall_page)
__INIT
SYM_CODE_START(startup_xen)
UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
cld
/* Clear .bss */