summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig122
-rw-r--r--arch/x86/Kconfig.assembler5
-rw-r--r--arch/x86/Makefile6
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c22
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/hyperv/hv_apic.c41
-rw-r--r--arch/x86/hyperv/hv_init.c107
-rw-r--r--arch/x86/hyperv/hv_spinlock.c2
-rw-r--r--arch/x86/hyperv/hv_vtl.c2
-rw-r--r--arch/x86/hyperv/ivm.c263
-rw-r--r--arch/x86/include/asm/acpi.h24
-rw-r--r--arch/x86/include/asm/apic.h239
-rw-r--r--arch/x86/include/asm/audit.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/disabled-features.h16
-rw-r--r--arch/x86/include/asm/fpu/api.h9
-rw-r--r--arch/x86/include/asm/fpu/regset.h7
-rw-r--r--arch/x86/include/asm/fpu/sched.h3
-rw-r--r--arch/x86/include/asm/fpu/types.h16
-rw-r--r--arch/x86/include/asm/fpu/xstate.h6
-rw-r--r--arch/x86/include/asm/hw_irq.h4
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h10
-rw-r--r--arch/x86/include/asm/idtentry.h3
-rw-r--r--arch/x86/include/asm/io.h5
-rw-r--r--arch/x86/include/asm/io_apic.h7
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h7
-rw-r--r--arch/x86/include/asm/kexec.h18
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/mpspec.h31
-rw-r--r--arch/x86/include/asm/mshyperv.h71
-rw-r--r--arch/x86/include/asm/pgtable.h330
-rw-r--r--arch/x86/include/asm/pgtable_types.h47
-rw-r--r--arch/x86/include/asm/processor.h9
-rw-r--r--arch/x86/include/asm/rmwcc.h11
-rw-r--r--arch/x86/include/asm/sections.h18
-rw-r--r--arch/x86/include/asm/shstk.h38
-rw-r--r--arch/x86/include/asm/smp.h14
-rw-r--r--arch/x86/include/asm/special_insns.h13
-rw-r--r--arch/x86/include/asm/tlbflush.h27
-rw-r--r--arch/x86/include/asm/trap_pf.h2
-rw-r--r--arch/x86/include/asm/traps.h15
-rw-r--r--arch/x86/include/asm/uaccess_64.h2
-rw-r--r--arch/x86/include/asm/xen/page.h5
-rw-r--r--arch/x86/include/asm/xen/swiotlb-xen.h6
-rw-r--r--arch/x86/include/uapi/asm/mman.h4
-rw-r--r--arch/x86/include/uapi/asm/prctl.h12
-rw-r--r--arch/x86/kernel/Makefile14
-rw-r--r--arch/x86/kernel/acpi/boot.c12
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c454
-rw-r--r--arch/x86/kernel/apic/apic_common.c21
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c80
-rw-r--r--arch/x86/kernel/apic/apic_noop.c91
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c50
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c89
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c8
-rw-r--r--arch/x86/kernel/apic/init.c110
-rw-r--r--arch/x86/kernel/apic/io_apic.c30
-rw-r--r--arch/x86/kernel/apic/ipi.c176
-rw-r--r--arch/x86/kernel/apic/local.h30
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c117
-rw-r--r--arch/x86/kernel/apic/probe_64.c18
-rw-r--r--arch/x86/kernel/apic/vector.c116
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c23
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c74
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c51
-rw-r--r--arch/x86/kernel/apm_32.c6
-rw-r--r--arch/x86/kernel/audit_64.c5
-rw-r--r--arch/x86/kernel/cet.c131
-rw-r--r--arch/x86/kernel/cpu/acrn.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/common.c45
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c1
-rw-r--r--arch/x86/kernel/cpu/hygon.c3
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c3
-rw-r--r--arch/x86/kernel/cpu/mce/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c95
-rw-r--r--arch/x86/kernel/cpu/proc.c23
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c41
-rw-r--r--arch/x86/kernel/cpuid.c31
-rw-r--r--arch/x86/kernel/crash.c142
-rw-r--r--arch/x86/kernel/devicetree.c21
-rw-r--r--arch/x86/kernel/fpu/core.c54
-rw-r--r--arch/x86/kernel/fpu/regset.c81
-rw-r--r--arch/x86/kernel/fpu/xstate.c92
-rw-r--r--arch/x86/kernel/i8259.c4
-rw-r--r--arch/x86/kernel/ibt_selftest.S17
-rw-r--r--arch/x86/kernel/idt.c3
-rw-r--r--arch/x86/kernel/irq.c14
-rw-r--r--arch/x86/kernel/irq_work.c4
-rw-r--r--arch/x86/kernel/jailhouse.c6
-rw-r--r--arch/x86/kernel/kprobes/core.c34
-rw-r--r--arch/x86/kernel/kvm.c14
-rw-r--r--arch/x86/kernel/mpparse.c20
-rw-r--r--arch/x86/kernel/msr.c31
-rw-r--r--arch/x86/kernel/nmi_selftest.c2
-rw-r--r--arch/x86/kernel/pci-dma.c29
-rw-r--r--arch/x86/kernel/process.c21
-rw-r--r--arch/x86/kernel/process_64.c8
-rw-r--r--arch/x86/kernel/ptrace.c12
-rw-r--r--arch/x86/kernel/setup.c7
-rw-r--r--arch/x86/kernel/setup_percpu.c10
-rw-r--r--arch/x86/kernel/sev.c4
-rw-r--r--arch/x86/kernel/shstk.c550
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/signal_32.c2
-rw-r--r--arch/x86/kernel/signal_64.c8
-rw-r--r--arch/x86/kernel/smp.c10
-rw-r--r--arch/x86/kernel/smpboot.c121
-rw-r--r--arch/x86/kernel/sys_x86_64.c6
-rw-r--r--arch/x86/kernel/traps.c87
-rw-r--r--arch/x86/kernel/vsmp_64.c2
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c2
-rw-r--r--arch/x86/kvm/vmx/vmx.c2
-rw-r--r--arch/x86/lib/copy_user_64.S57
-rw-r--r--arch/x86/mm/fault.c29
-rw-r--r--arch/x86/mm/init.c9
-rw-r--r--arch/x86/mm/pat/set_memory.c4
-rw-r--r--arch/x86/mm/pgtable.c87
-rw-r--r--arch/x86/mm/srat.c5
-rw-r--r--arch/x86/mm/tlb.c21
-rw-r--r--arch/x86/net/bpf_jit_comp.c387
-rw-r--r--arch/x86/pci/amd_bus.c8
-rw-r--r--arch/x86/pci/bus_numa.c2
-rw-r--r--arch/x86/pci/irq.c4
-rw-r--r--arch/x86/pci/xen.c2
-rw-r--r--arch/x86/platform/uv/uv_irq.c2
-rw-r--r--arch/x86/platform/uv/uv_nmi.c2
-rw-r--r--arch/x86/um/Makefile5
-rw-r--r--arch/x86/um/asm/mm_context.h2
-rw-r--r--arch/x86/um/checksum_32.S214
-rw-r--r--arch/x86/video/Makefile2
-rw-r--r--arch/x86/xen/apic.c85
-rw-r--r--arch/x86/xen/enlighten_hvm.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c16
-rw-r--r--arch/x86/xen/mmu_pv.c22
-rw-r--r--arch/x86/xen/setup.c10
-rw-r--r--arch/x86/xen/smp_pv.c5
-rw-r--r--arch/x86/xen/xen-asm.S2
142 files changed, 3659 insertions, 2320 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8d9e4b362572..982b777eadc7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@ config X86
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@ -128,7 +129,8 @@ config X86
select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANT_LD_ORPHAN_WARN
- select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64
+ select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64
+ select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select BUILDTIME_TABLE_SORT
@@ -1813,6 +1815,11 @@ config CC_HAS_IBT
(CC_IS_CLANG && CLANG_VERSION >= 140000)) && \
$(as-instr,endbr64)
+config X86_CET
+ def_bool n
+ help
+ CET features configured (Shadow stack or IBT)
+
config X86_KERNEL_IBT
prompt "Indirect Branch Tracking"
def_bool y
@@ -1820,6 +1827,7 @@ config X86_KERNEL_IBT
# https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f
depends on !LD_IS_LLD || LLD_VERSION >= 140000
select OBJTOOL
+ select X86_CET
help
Build the kernel with support for Indirect Branch Tracking, a
hardware support course-grain forward-edge Control Flow Integrity
@@ -1913,6 +1921,24 @@ config X86_SGX
If unsure, say N.
+config X86_USER_SHADOW_STACK
+ bool "X86 userspace shadow stack"
+ depends on AS_WRUSS
+ depends on X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS
+ select X86_CET
+ help
+ Shadow stack protection is a hardware feature that detects function
+ return address corruption. This helps mitigate ROP attacks.
+ Applications must be enabled to use it, and old userspace does not
+ get protection "for free".
+
+ CPUs supporting shadow stacks were first released in 2020.
+
+ See Documentation/arch/x86/shstk.rst for more information.
+
+ If unsure, say N.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
@@ -2004,88 +2030,37 @@ config EFI_RUNTIME_MAP
source "kernel/Kconfig.hz"
-config KEXEC
- bool "kexec system call"
- select KEXEC_CORE
- help
- kexec is a system call that implements the ability to shutdown your
- current kernel, and to start another kernel. It is like a reboot
- but it is independent of the system firmware. And like a reboot
- you can start any kernel with it, not just Linux.
-
- The name comes from the similarity to the exec system call.
+config ARCH_SUPPORTS_KEXEC
+ def_bool y
- It is an ongoing process to be certain the hardware in a machine
- is properly shutdown, so do not be surprised if this code does not
- initially work for you. As of this writing the exact hardware
- interface is strongly in flux, so no good recommendation can be
- made.
+config ARCH_SUPPORTS_KEXEC_FILE
+ def_bool X86_64 && CRYPTO && CRYPTO_SHA256
-config KEXEC_FILE
- bool "kexec file based system call"
- select KEXEC_CORE
+config ARCH_SELECTS_KEXEC_FILE
+ def_bool y
+ depends on KEXEC_FILE
select HAVE_IMA_KEXEC if IMA
- depends on X86_64
- depends on CRYPTO=y
- depends on CRYPTO_SHA256=y
- help
- This is new version of kexec system call. This system call is
- file based and takes file descriptors as system call argument
- for kernel and initramfs as opposed to list of segments as
- accepted by previous system call.
-config ARCH_HAS_KEXEC_PURGATORY
+config ARCH_SUPPORTS_KEXEC_PURGATORY
def_bool KEXEC_FILE
-config KEXEC_SIG
- bool "Verify kernel signature during kexec_file_load() syscall"
- depends on KEXEC_FILE
- help
-
- This option makes the kexec_file_load() syscall check for a valid
- signature of the kernel image. The image can still be loaded without
- a valid signature unless you also enable KEXEC_SIG_FORCE, though if
- there's a signature that we can check, then it must be valid.
+config ARCH_SUPPORTS_KEXEC_SIG
+ def_bool y
- In addition to this option, you need to enable signature
- verification for the corresponding kernel image type being
- loaded in order for this to work.
+config ARCH_SUPPORTS_KEXEC_SIG_FORCE
+ def_bool y
-config KEXEC_SIG_FORCE
- bool "Require a valid signature in kexec_file_load() syscall"
- depends on KEXEC_SIG
- help
- This option makes kernel signature verification mandatory for
- the kexec_file_load() syscall.
+config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
+ def_bool y
-config KEXEC_BZIMAGE_VERIFY_SIG
- bool "Enable bzImage signature verification support"
- depends on KEXEC_SIG
- depends on SIGNED_PE_FILE_VERIFICATION
- select SYSTEM_TRUSTED_KEYRING
- help
- Enable bzImage signature verification support.
+config ARCH_SUPPORTS_KEXEC_JUMP
+ def_bool y
-config CRASH_DUMP
- bool "kernel crash dumps"
- depends on X86_64 || (X86_32 && HIGHMEM)
- help
- Generate crash dump after being started by kexec.
- This should be normally only set in special crash dump kernels
- which are loaded in the main kernel with kexec-tools into
- a specially reserved region and then later executed after
- a crash by kdump/kexec. The crash dump kernel must be compiled
- to a memory address not used by the main kernel or BIOS using
- PHYSICAL_START, or it must be built as a relocatable image
- (CONFIG_RELOCATABLE=y).
- For more details see Documentation/admin-guide/kdump/kdump.rst
+config ARCH_SUPPORTS_CRASH_DUMP
+ def_bool X86_64 || (X86_32 && HIGHMEM)
-config KEXEC_JUMP
- bool "kexec jump"
- depends on KEXEC && HIBERNATION
- help
- Jump between original kernel and kexeced kernel and invoke
- code in physical address mode via KEXEC
+config ARCH_SUPPORTS_CRASH_HOTPLUG
+ def_bool y
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
@@ -2599,9 +2574,6 @@ config ARCH_HAS_ADD_PAGES
def_bool y
depends on ARCH_ENABLE_MEMORY_HOTPLUG
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
- def_bool y
-
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index b88f784cb02e..8ad41da301e5 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -24,3 +24,8 @@ config AS_GFNI
def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2)
help
Supported by binutils >= 2.30 and LLVM integrated assembler
+
+config AS_WRUSS
+ def_bool $(as-instr,wrussq %rax$(comma)(%rbx))
+ help
+ Supported by binutils >= 2.31 and LLVM integrated assembler
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index fdc2e3abd615..5bfe5caaa444 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -259,7 +259,7 @@ drivers-$(CONFIG_PCI) += arch/x86/pci/
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
-drivers-$(CONFIG_FB) += arch/x86/video/
+drivers-$(CONFIG_FB_CORE) += arch/x86/video/
####
# boot loader support. Several targets are kept for legacy purposes
@@ -335,9 +335,5 @@ define archhelp
echo ' bzdisk/fdimage*/hdimage/isoimage also accept:'
echo ' FDARGS="..." arguments for the booted kernel'
echo ' FDINITRD=file initrd for the booted kernel'
- echo ''
- echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
- echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
- echo ' x86_debug.config - Enable tip tree debugging options for testing'
endef
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index a5b0cb3efeba..39d6a62ac627 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -229,10 +229,9 @@ static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
return (struct crypto_aes_ctx *)ALIGN(addr, align);
}
-static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
+static int aes_set_key_common(struct crypto_aes_ctx *ctx,
const u8 *in_key, unsigned int key_len)
{
- struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx);
int err;
if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
@@ -253,7 +252,8 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
unsigned int key_len)
{
- return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
+ return aes_set_key_common(aes_ctx(crypto_tfm_ctx(tfm)), in_key,
+ key_len);
}
static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
@@ -285,8 +285,7 @@ static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int len)
{
- return aes_set_key_common(crypto_skcipher_tfm(tfm),
- crypto_skcipher_ctx(tfm), key, len);
+ return aes_set_key_common(aes_ctx(crypto_skcipher_ctx(tfm)), key, len);
}
static int ecb_encrypt(struct skcipher_request *req)
@@ -627,8 +626,7 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
- return aes_set_key_common(crypto_aead_tfm(aead),
- &ctx->aes_key_expanded, key, key_len) ?:
+ return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
}
@@ -893,14 +891,13 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
keylen /= 2;
/* first half of xts-key is for crypt */
- err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
- key, keylen);
+ err = aes_set_key_common(aes_ctx(ctx->raw_crypt_ctx), key, keylen);
if (err)
return err;
/* second half of xts-key is for tweak */
- return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
- key + keylen, keylen);
+ return aes_set_key_common(aes_ctx(ctx->raw_tweak_ctx), key + keylen,
+ keylen);
}
static int xts_crypt(struct skcipher_request *req, bool encrypt)
@@ -1150,8 +1147,7 @@ static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
{
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
- return aes_set_key_common(crypto_aead_tfm(aead),
- &ctx->aes_key_expanded, key, key_len) ?:
+ return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
}
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 814768249eae..1d6eee30eceb 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -374,6 +374,7 @@
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
451 common cachestat sys_cachestat
452 common fchmodat2 sys_fchmodat2
+453 64 map_shadow_stack sys_map_shadow_stack
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index b21335e6a210..97bfe5f0531f 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -86,14 +86,14 @@ static void hv_apic_write(u32 reg, u32 val)
}
}
-static void hv_apic_eoi_write(u32 reg, u32 val)
+static void hv_apic_eoi_write(void)
{
struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()];
if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1))
return;
- wrmsr(HV_X64_MSR_EOI, val, 0);
+ wrmsr(HV_X64_MSR_EOI, APIC_EOI_ACK, 0);
}
static bool cpu_is_self(int cpu)
@@ -175,8 +175,11 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
(exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
return true;
- if (!hv_hypercall_pg)
- return false;
+ /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */
+ if (!hv_hypercall_pg) {
+ if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx())
+ return false;
+ }
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return false;
@@ -229,9 +232,15 @@ static bool __send_ipi_one(int cpu, int vector)
trace_hyperv_send_ipi_one(cpu, vector);
- if (!hv_hypercall_pg || (vp == VP_INVAL))
+ if (vp == VP_INVAL)
return false;
+ /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */
+ if (!hv_hypercall_pg) {
+ if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx())
+ return false;
+ }
+
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return false;
@@ -286,12 +295,12 @@ void __init hv_apic_init(void)
*/
orig_apic = *apic;
- apic->send_IPI = hv_send_ipi;
- apic->send_IPI_mask = hv_send_ipi_mask;
- apic->send_IPI_mask_allbutself = hv_send_ipi_mask_allbutself;
- apic->send_IPI_allbutself = hv_send_ipi_allbutself;
- apic->send_IPI_all = hv_send_ipi_all;
- apic->send_IPI_self = hv_send_ipi_self;
+ apic_update_callback(send_IPI, hv_send_ipi);
+ apic_update_callback(send_IPI_mask, hv_send_ipi_mask);
+ apic_update_callback(send_IPI_mask_allbutself, hv_send_ipi_mask_allbutself);
+ apic_update_callback(send_IPI_allbutself, hv_send_ipi_allbutself);
+ apic_update_callback(send_IPI_all, hv_send_ipi_all);
+ apic_update_callback(send_IPI_self, hv_send_ipi_self);
}
if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
@@ -308,12 +317,12 @@ void __init hv_apic_init(void)
* lazy EOI when available, but the same accessor works for
* both xapic and x2apic because the field layout is the same.
*/
- apic_set_eoi_write(hv_apic_eoi_write);
+ apic_update_callback(eoi, hv_apic_eoi_write);
if (!x2apic_enabled()) {
- apic->read = hv_apic_read;
- apic->write = hv_apic_write;
- apic->icr_write = hv_apic_icr_write;
- apic->icr_read = hv_apic_icr_read;
+ apic_update_callback(read, hv_apic_read);
+ apic_update_callback(write, hv_apic_write);
+ apic_update_callback(icr_write, hv_apic_icr_write);
+ apic_update_callback(icr_read, hv_apic_icr_read);
}
}
}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 953e280c07c3..783ed339f341 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -19,6 +19,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/idtentry.h>
+#include <asm/set_memory.h>
#include <linux/kexec.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
@@ -52,7 +53,7 @@ static int hyperv_init_ghcb(void)
void *ghcb_va;
void **ghcb_base;
- if (!hv_isolation_type_snp())
+ if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp())
return 0;
if (!hv_ghcb_pg)
@@ -80,7 +81,7 @@ static int hyperv_init_ghcb(void)
static int hv_cpu_init(unsigned int cpu)
{
union hv_vp_assist_msr_contents msr = { 0 };
- struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
+ struct hv_vp_assist_page **hvp;
int ret;
ret = hv_common_cpu_init(cpu);
@@ -90,6 +91,7 @@ static int hv_cpu_init(unsigned int cpu)
if (!hv_vp_assist_page)
return 0;
+ hvp = &hv_vp_assist_page[cpu];
if (hv_root_partition) {
/*
* For root partition we get the hypervisor provided VP assist
@@ -107,8 +109,21 @@ static int hv_cpu_init(unsigned int cpu)
* in hv_cpu_die(), otherwise a CPU may not be stopped in the
* case of CPU offlining and the VM will hang.
*/
- if (!*hvp)
+ if (!*hvp) {
*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+
+ /*
+ * Hyper-V should never specify a VM that is a Confidential
+ * VM and also running in the root partition. Root partition
+ * is blocked to run in Confidential VM. So only decrypt assist
+ * page in non-root partition here.
+ */
+ if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1));
+ memset(*hvp, 0, PAGE_SIZE);
+ }
+ }
+
if (*hvp)
msr.pfn = vmalloc_to_pfn(*hvp);
@@ -162,7 +177,7 @@ static inline bool hv_reenlightenment_available(void)
DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment)
{
- ack_APIC_irq();
+ apic_eoi();
inc_irq_stat(irq_hv_reenlightenment_count);
schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
}
@@ -379,6 +394,36 @@ static void __init hv_get_partition_id(void)
local_irq_restore(flags);
}
+static u8 __init get_vtl(void)
+{
+ u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS;
+ struct hv_get_vp_registers_input *input;
+ struct hv_get_vp_registers_output *output;
+ unsigned long flags;
+ u64 ret;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = (struct hv_get_vp_registers_output *)input;
+
+ memset(input, 0, struct_size(input, element, 1));
+ input->header.partitionid = HV_PARTITION_ID_SELF;
+ input->header.vpindex = HV_VP_INDEX_SELF;
+ input->header.inputvtl = 0;
+ input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS;
+
+ ret = hv_do_hypercall(control, input, output);
+ if (hv_result_success(ret)) {
+ ret = output->as64.low & HV_X64_VTL_MASK;
+ } else {
+ pr_err("Failed to get VTL(%lld) and set VTL to zero by default.\n", ret);
+ ret = 0;
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -399,14 +444,24 @@ void __init hyperv_init(void)
if (hv_common_init())
return;
- hv_vp_assist_page = kcalloc(num_possible_cpus(),
- sizeof(*hv_vp_assist_page), GFP_KERNEL);
+ /*
+ * The VP assist page is useless to a TDX guest: the only use we
+ * would have for it is lazy EOI, which can not be used with TDX.
+ */
+ if (hv_isolation_type_tdx())
+ hv_vp_assist_page = NULL;
+ else
+ hv_vp_assist_page = kcalloc(num_possible_cpus(),
+ sizeof(*hv_vp_assist_page),
+ GFP_KERNEL);
if (!hv_vp_assist_page) {
ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
- goto common_free;
+
+ if (!hv_isolation_type_tdx())
+ goto common_free;
}
- if (hv_isolation_type_snp()) {
+ if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
/* Negotiate GHCB Version. */
if (!hv_ghcb_negotiate_protocol())
hv_ghcb_terminate(SEV_TERM_SET_GEN,
@@ -426,12 +481,32 @@ void __init hyperv_init(void)
* Setup the hypercall page and enable hypercalls.
* 1. Register the guest ID
* 2. Enable the hypercall and register the hypercall page
+ *
+ * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg:
+ * when the hypercall input is a page, such a VM must pass a decrypted
+ * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page
+ * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present.
+ *
+ * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls,
+ * which are handled by the paravisor and the VM must use an encrypted
+ * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and
+ * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and
+ * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls:
+ * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8().
+ * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e.
+ * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg;
+ * instead, hv_post_message() uses the post_msg_page, which is decrypted
+ * in such a VM and is only used in such a VM.
*/
guest_id = hv_generate_guest_id(LINUX_VERSION_CODE);
wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
- /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+ /* With the paravisor, the VM must also write the ID via GHCB/GHCI */
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */
+ if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
+ goto skip_hypercall_pg_init;
hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
@@ -472,6 +547,7 @@ void __init hyperv_init(void)
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
}
+skip_hypercall_pg_init:
/*
* Some versions of Hyper-V that provide IBT in guest VMs have a bug
* in that there's no ENDBR64 instruction at the entry to the
@@ -527,11 +603,15 @@ void __init hyperv_init(void)
/* Query the VMs extended capability once, so that it can be cached. */
hv_query_ext_cap(0);
+ /* Find the VTL */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp())
+ ms_hyperv.vtl = get_vtl();
+
return;
clean_guest_os_id:
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
cpuhp_remove_state(cpuhp);
free_ghcb_page:
free_percpu(hv_ghcb_pg);
@@ -552,7 +632,7 @@ void hyperv_cleanup(void)
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
/*
* Reset hypercall page reference before reset the page,
@@ -615,6 +695,9 @@ bool hv_is_hyperv_initialized(void)
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return false;
+ /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */
+ if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
+ return true;
/*
* Verify that earlier initialization succeeded by checking
* that the hypercall page is setup
diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index 91cfe698bde0..737d6f7a6155 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -20,7 +20,7 @@ static bool __initdata hv_pvspin = true;
static void hv_qlock_kick(int cpu)
{
- apic->send_IPI(cpu, X86_PLATFORM_IPI_VECTOR);
+ __apic_send_IPI(cpu, X86_PLATFORM_IPI_VECTOR);
}
static void hv_qlock_wait(u8 *byte, u8 val)
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index db5d2ea39fc0..36a562218010 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -226,7 +226,7 @@ static int __init hv_vtl_early_init(void)
"Please add 'noxsave' to the kernel command line.\n");
real_mode_header = &hv_vtl_real_mode_header;
- apic->wakeup_secondary_cpu_64 = hv_vtl_wakeup_secondary_cpu;
+ apic_update_callback(wakeup_secondary_cpu_64, hv_vtl_wakeup_secondary_cpu);
return 0;
}
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 28be6df88063..8c6bf07f7d2b 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -18,6 +18,11 @@
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/mtrr.h>
+#include <asm/io_apic.h>
+#include <asm/realmode.h>
+#include <asm/e820/api.h>
+#include <asm/desc.h>
+#include <uapi/asm/vmx.h>
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -56,8 +61,10 @@ union hv_ghcb {
} hypercall;
} __packed __aligned(HV_HYP_PAGE_SIZE);
+/* Only used in an SNP VM with the paravisor */
static u16 hv_ghcb_version __ro_after_init;
+/* Functions only used in an SNP VM with the paravisor go here. */
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
{
union hv_ghcb *hv_ghcb;
@@ -175,7 +182,7 @@ bool hv_ghcb_negotiate_protocol(void)
return true;
}
-void hv_ghcb_msr_write(u64 msr, u64 value)
+static void hv_ghcb_msr_write(u64 msr, u64 value)
{
union hv_ghcb *hv_ghcb;
void **ghcb_base;
@@ -203,9 +210,8 @@ void hv_ghcb_msr_write(u64 msr, u64 value)
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(hv_ghcb_msr_write);
-void hv_ghcb_msr_read(u64 msr, u64 *value)
+static void hv_ghcb_msr_read(u64 msr, u64 *value)
{
union hv_ghcb *hv_ghcb;
void **ghcb_base;
@@ -235,7 +241,217 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
| ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
+
+/* Only used in a fully enlightened SNP VM, i.e. without the paravisor */
+static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE);
+static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa);
+
+/* Functions only used in an SNP VM without the paravisor go here. */
+
+#define hv_populate_vmcb_seg(seg, gdtr_base) \
+do { \
+ if (seg.selector) { \
+ seg.base = 0; \
+ seg.limit = HV_AP_SEGMENT_LIMIT; \
+ seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \
+ seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \
+ } \
+} while (0) \
+
+static int snp_set_vmsa(void *va, bool vmsa)
+{
+ u64 attrs;
+
+ /*
+ * Running at VMPL0 allows the kernel to change the VMSA bit for a page
+ * using the RMPADJUST instruction. However, for the instruction to
+ * succeed it must target the permissions of a lesser privileged
+ * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
+ * instruction in the AMD64 APM Volume 3).
+ */
+ attrs = 1;
+ if (vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
+{
+ int err;
+
+ err = snp_set_vmsa(vmsa, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+}
+
+int hv_snp_boot_ap(int cpu, unsigned long start_ip)
+{
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
+ __get_free_page(GFP_KERNEL | __GFP_ZERO);
+ struct sev_es_save_area *cur_vmsa;
+ struct desc_ptr gdtr;
+ u64 ret, retry = 5;
+ struct hv_enable_vp_vtl *start_vp_input;
+ unsigned long flags;
+
+ if (!vmsa)
+ return -ENOMEM;
+
+ native_store_gdt(&gdtr);
+
+ vmsa->gdtr.base = gdtr.address;
+ vmsa->gdtr.limit = gdtr.size;
+
+ asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector));
+ hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base);
+
+ asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector));
+ hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base);
+
+ asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector));
+ hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base);
+
+ asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector));
+ hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base);
+
+ vmsa->efer = native_read_msr(MSR_EFER);
+
+ asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4));
+ asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3));
+ asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0));
+
+ vmsa->xcr0 = 1;
+ vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT;
+ vmsa->rip = (u64)secondary_startup_64_no_verify;
+ vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE];
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = 0;
+ vmsa->sev_features = sev_status >> 2;
+
+ ret = snp_set_vmsa(vmsa, true);
+ if (!ret) {
+ pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
+ free_page((u64)vmsa);
+ return ret;
+ }
+
+ local_irq_save(flags);
+ start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg;
+ memset(start_vp_input, 0, sizeof(*start_vp_input));
+ start_vp_input->partition_id = -1;
+ start_vp_input->vp_index = cpu;
+ start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl;
+ *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1;
+
+ do {
+ ret = hv_do_hypercall(HVCALL_START_VP,
+ start_vp_input, NULL);
+ } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--);
+
+ local_irq_restore(flags);
+
+ if (!hv_result_success(ret)) {
+ pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret);
+ snp_cleanup_vmsa(vmsa);
+ vmsa = NULL;
+ }
+
+ cur_vmsa = per_cpu(hv_sev_vmsa, cpu);
+ /* Free up any previous VMSA page */
+ if (cur_vmsa)
+ snp_cleanup_vmsa(cur_vmsa);
+
+ /* Record the current VMSA page */
+ per_cpu(hv_sev_vmsa, cpu) = vmsa;
+
+ return ret;
+}
+
+#else
+static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
+static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+static void hv_tdx_msr_write(u64 msr, u64 val)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_MSR_WRITE,
+ .r12 = msr,
+ .r13 = val,
+ };
+
+ u64 ret = __tdx_hypercall(&args);
+
+ WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret);
+}
+
+static void hv_tdx_msr_read(u64 msr, u64 *val)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_MSR_READ,
+ .r12 = msr,
+ };
+
+ u64 ret = __tdx_hypercall_ret(&args);
+
+ if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret))
+ *val = 0;
+ else
+ *val = args.r11;
+}
+
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
+{
+ struct tdx_hypercall_args args = { };
+
+ args.r10 = control;
+ args.rdx = param1;
+ args.r8 = param2;
+
+ (void)__tdx_hypercall_ret(&args);
+
+ return args.r11;
+}
+
+#else
+static inline void hv_tdx_msr_write(u64 msr, u64 value) {}
+static inline void hv_tdx_msr_read(u64 msr, u64 *value) {}
+#endif /* CONFIG_INTEL_TDX_GUEST */
+
+#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
+void hv_ivm_msr_write(u64 msr, u64 value)
+{
+ if (!ms_hyperv.paravisor_present)
+ return;
+
+ if (hv_isolation_type_tdx())
+ hv_tdx_msr_write(msr, value);
+ else if (hv_isolation_type_snp())
+ hv_ghcb_msr_write(msr, value);
+}
+
+void hv_ivm_msr_read(u64 msr, u64 *value)
+{
+ if (!ms_hyperv.paravisor_present)
+ return;
+
+ if (hv_isolation_type_tdx())
+ hv_tdx_msr_read(msr, value);
+ else if (hv_isolation_type_snp())
+ hv_ghcb_msr_read(msr, value);
+}
/*
* hv_mark_gpa_visibility - Set pages visible to host via hvcall.
@@ -358,13 +574,34 @@ static bool hv_is_private_mmio(u64 addr)
void __init hv_vtom_init(void)
{
+ enum hv_isolation_type type = hv_get_isolation_type();
+
+ switch (type) {
+ case HV_ISOLATION_TYPE_VBS:
+ fallthrough;
/*
* By design, a VM using vTOM doesn't see the SEV setting,
* so SEV initialization is bypassed and sev_status isn't set.
* Set it here to indicate a vTOM VM.
+ *
+ * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is
+ * defined as 0ULL, to which we can't assigned a value.
*/
- sev_status = MSR_AMD64_SNP_VTOM;
- cc_vendor = CC_VENDOR_AMD;
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ case HV_ISOLATION_TYPE_SNP:
+ sev_status = MSR_AMD64_SNP_VTOM;
+ cc_vendor = CC_VENDOR_AMD;
+ break;
+#endif
+
+ case HV_ISOLATION_TYPE_TDX:
+ cc_vendor = CC_VENDOR_INTEL;
+ break;
+
+ default:
+ panic("hv_vtom_init: unsupported isolation type %d\n", type);
+ }
+
cc_set_mask(ms_hyperv.shared_gpa_boundary);
physical_mask &= ms_hyperv.shared_gpa_boundary - 1;
@@ -377,7 +614,7 @@ void __init hv_vtom_init(void)
mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
}
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
+#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */
enum hv_isolation_type hv_get_isolation_type(void)
{
@@ -405,10 +642,20 @@ bool hv_is_isolation_supported(void)
DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
/*
- * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
+ * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based
* isolation VM.
*/
bool hv_isolation_type_snp(void)
{
return static_branch_unlikely(&isolation_type_snp);
}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_tdx);
+/*
+ * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based
+ * isolated VM.
+ */
+bool hv_isolation_type_tdx(void)
+{
+ return static_branch_unlikely(&isolation_type_tdx);
+}
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2888c0ee4df0..c8a7fc23f63c 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -6,7 +6,7 @@
* Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
* Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
*/
-#include <acpi/pdc_intel.h>
+#include <acpi/proc_cap_intel.h>
#include <asm/numa.h>
#include <asm/fixmap.h>
@@ -102,23 +102,31 @@ static inline bool arch_has_acpi_pdc(void)
c->x86_vendor == X86_VENDOR_CENTAUR);
}
-static inline void arch_acpi_set_pdc_bits(u32 *buf)
+static inline void arch_acpi_set_proc_cap_bits(u32 *cap)
{
struct cpuinfo_x86 *c = &cpu_data(0);
- buf[2] |= ACPI_PDC_C_CAPABILITY_SMP;
+ *cap |= ACPI_PROC_CAP_C_CAPABILITY_SMP;
+
+ /* Enable coordination with firmware's _TSD info */
+ *cap |= ACPI_PROC_CAP_SMP_T_SWCOORD;
if (cpu_has(c, X86_FEATURE_EST))
- buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
+ *cap |= ACPI_PROC_CAP_EST_CAPABILITY_SWSMP;
if (cpu_has(c, X86_FEATURE_ACPI))
- buf[2] |= ACPI_PDC_T_FFH;
+ *cap |= ACPI_PROC_CAP_T_FFH;
+
+ if (cpu_has(c, X86_FEATURE_HWP))
+ *cap |= ACPI_PROC_CAP_COLLAB_PROC_PERF;
/*
- * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
+ * If mwait/monitor is unsupported, C_C1_FFH and
+ * C2/C3_FFH will be disabled.
*/
- if (!cpu_has(c, X86_FEATURE_MWAIT))
- buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
+ if (!cpu_has(c, X86_FEATURE_MWAIT) ||
+ boot_option_idle_override == IDLE_NOMWAIT)
+ *cap &= ~(ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH);
}
static inline bool acpi_has_cpu_in_madt(void)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 98c32aa5963a..5af4ec1a0f71 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -3,6 +3,7 @@
#define _ASM_X86_APIC_H
#include <linux/cpumask.h>
+#include <linux/static_call.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
@@ -40,11 +41,9 @@
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
-extern void generic_apic_probe(void);
+extern void x86_32_probe_apic(void);
#else
-static inline void generic_apic_probe(void)
-{
-}
+static inline void x86_32_probe_apic(void) { }
#endif
#ifdef CONFIG_X86_LOCAL_APIC
@@ -52,7 +51,7 @@ static inline void generic_apic_probe(void)
extern int apic_verbosity;
extern int local_apic_timer_c2_ok;
-extern int disable_apic;
+extern bool apic_is_disabled;
extern unsigned int lapic_timer_period;
extern int cpuid_to_apicid[];
@@ -66,20 +65,6 @@ enum apic_intr_mode_id {
APIC_SYMMETRIC_IO_NO_ROUTING
};
-#ifdef CONFIG_SMP
-extern void __inquire_remote_apic(int apicid);
-#else /* CONFIG_SMP */
-static inline void __inquire_remote_apic(int apicid)
-{
-}
-#endif /* CONFIG_SMP */
-
-static inline void default_inquire_remote_apic(int apicid)
-{
- if (apic_verbosity >= APIC_DEBUG)
- __inquire_remote_apic(apicid);
-}
-
/*
* With 82489DX we can't rely on apic feature bit
* retrieved via cpuid but still have to deal with
@@ -90,7 +75,7 @@ static inline void default_inquire_remote_apic(int apicid)
*/
static inline bool apic_from_smp_config(void)
{
- return smp_found_config && !disable_apic;
+ return smp_found_config && !apic_is_disabled;
}
/*
@@ -114,8 +99,11 @@ static inline u32 native_apic_mem_read(u32 reg)
return *((volatile u32 *)(APIC_BASE + reg));
}
-extern void native_apic_wait_icr_idle(void);
-extern u32 native_safe_apic_wait_icr_idle(void);
+static inline void native_apic_mem_eoi(void)
+{
+ native_apic_mem_write(APIC_EOI, APIC_EOI_ACK);
+}
+
extern void native_apic_icr_write(u32 low, u32 id);
extern u64 native_apic_icr_read(void);
@@ -149,12 +137,12 @@ extern void setup_secondary_APIC_clock(void);
extern void lapic_update_tsc_freq(void);
#ifdef CONFIG_X86_64
-static inline int apic_force_enable(unsigned long addr)
+static inline bool apic_force_enable(unsigned long addr)
{
- return -1;
+ return false;
}
#else
-extern int apic_force_enable(unsigned long addr);
+extern bool apic_force_enable(unsigned long addr);
#endif
extern void apic_ap_setup(void);
@@ -207,7 +195,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v)
wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
}
-static inline void native_apic_msr_eoi_write(u32 reg, u32 v)
+static inline void native_apic_msr_eoi(void)
{
__wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0);
}
@@ -223,18 +211,6 @@ static inline u32 native_apic_msr_read(u32 reg)
return (u32)msr;
}
-static inline void native_x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return;
-}
-
-static inline u32 native_safe_x2apic_wait_icr_idle(void)
-{
- /* no need to wait for icr idle in x2apic */
- return 0;
-}
-
static inline void native_x2apic_icr_write(u32 low, u32 id)
{
wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
@@ -261,7 +237,7 @@ static inline int x2apic_enabled(void)
#else /* !CONFIG_X86_X2APIC */
static inline void x2apic_setup(void) { }
static inline int x2apic_enabled(void) { return 0; }
-
+static inline u32 native_apic_msr_read(u32 reg) { BUG(); }
#define x2apic_mode (0)
#define x2apic_supported() (0)
#endif /* !CONFIG_X86_X2APIC */
@@ -280,8 +256,8 @@ struct irq_data;
*/
struct apic {
/* Hotpath functions first */
- void (*eoi_write)(u32 reg, u32 v);
- void (*native_eoi_write)(u32 reg, u32 v);
+ void (*eoi)(void);
+ void (*native_eoi)(void);
void (*write)(u32 reg, u32 v);
u32 (*read)(u32 reg);
@@ -296,10 +272,11 @@ struct apic {
void (*send_IPI_all)(int vector);
void (*send_IPI_self)(int vector);
- u32 disable_esr;
-
enum apic_delivery_modes delivery_mode;
- bool dest_mode_logical;
+
+ u32 disable_esr : 1,
+ dest_mode_logical : 1,
+ x2apic_set_max_apicid : 1;
u32 (*calc_dest_apicid)(unsigned int cpu);
@@ -307,19 +284,18 @@ struct apic {
u64 (*icr_read)(void);
void (*icr_write)(u32 low, u32 high);
+ /* The limit of the APIC ID space. */
+ u32 max_apic_id;
+
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
- int (*apic_id_valid)(u32 apicid);
- int (*apic_id_registered)(void);
+ bool (*apic_id_registered)(void);
bool (*check_apicid_used)(physid_mask_t *map, int apicid);
void (*init_apic_ldr)(void);
void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
- void (*setup_apic_routing)(void);
int (*cpu_present_to_apicid)(int mps_cpu);
- void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
- int (*check_phys_apicid_present)(int phys_apicid);
int (*phys_pkg_id)(int cpuid_apic, int index_msb);
u32 (*get_apic_id)(unsigned long x);
@@ -330,24 +306,26 @@ struct apic {
/* wakeup secondary CPU using 64-bit wakeup point */
int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
- void (*inquire_remote_apic)(int apicid);
-
-#ifdef CONFIG_X86_32
- /*
- * Called very early during boot from get_smp_config(). It should
- * return the logical apicid. x86_[bios]_cpu_to_apicid is
- * initialized before this function is called.
- *
- * If logical apicid can't be determined that early, the function
- * may return BAD_APICID. Logical apicid will be configured after
- * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity
- * won't be applied properly during early boot in this case.
- */
- int (*x86_32_early_logical_apicid)(int cpu);
-#endif
char *name;
};
+struct apic_override {
+ void (*eoi)(void);
+ void (*native_eoi)(void);
+ void (*write)(u32 reg, u32 v);
+ u32 (*read)(u32 reg);
+ void (*send_IPI)(int cpu, int vector);
+ void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+ void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec);
+ void (*send_IPI_allbutself)(int vector);
+ void (*send_IPI_all)(int vector);
+ void (*send_IPI_self)(int vector);
+ u64 (*icr_read)(void);
+ void (*icr_write)(u32 low, u32 high);
+ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
+};
+
/*
* Pointer to the local APIC driver in use on this system (there's
* always just one such driver in use - the kernel decides via an
@@ -383,43 +361,111 @@ extern int lapic_can_unplug_cpu(void);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
+extern struct apic_override __x86_apic_override;
-static inline u32 apic_read(u32 reg)
+void __init apic_setup_apic_calls(void);
+void __init apic_install_driver(struct apic *driver);
+
+#define apic_update_callback(_callback, _fn) { \
+ __x86_apic_override._callback = _fn; \
+ apic->_callback = _fn; \
+ static_call_update(apic_call_##_callback, _fn); \
+ pr_info("APIC: %s() replaced with %ps()\n", #_callback, _fn); \
+}
+
+#define DECLARE_APIC_CALL(__cb) \
+ DECLARE_STATIC_CALL(apic_call_##__cb, *apic->__cb)
+
+DECLARE_APIC_CALL(eoi);
+DECLARE_APIC_CALL(native_eoi);
+DECLARE_APIC_CALL(icr_read);
+DECLARE_APIC_CALL(icr_write);
+DECLARE_APIC_CALL(read);
+DECLARE_APIC_CALL(send_IPI);
+DECLARE_APIC_CALL(send_IPI_mask);
+DECLARE_APIC_CALL(send_IPI_mask_allbutself);
+DECLARE_APIC_CALL(send_IPI_allbutself);
+DECLARE_APIC_CALL(send_IPI_all);
+DECLARE_APIC_CALL(send_IPI_self);
+DECLARE_APIC_CALL(wait_icr_idle);
+DECLARE_APIC_CALL(wakeup_secondary_cpu);
+DECLARE_APIC_CALL(wakeup_secondary_cpu_64);
+DECLARE_APIC_CALL(write);
+
+static __always_inline u32 apic_read(u32 reg)
+{
+ return static_call(apic_call_read)(reg);
+}
+
+static __always_inline void apic_write(u32 reg, u32 val)
+{
+ static_call(apic_call_write)(reg, val);
+}
+
+static __always_inline void apic_eoi(void)
+{
+ static_call(apic_call_eoi)();
+}
+
+static __always_inline void apic_native_eoi(void)
{
- return apic->read(reg);
+ static_call(apic_call_native_eoi)();
}
-static inline void apic_write(u32 reg, u32 val)
+static __always_inline u64 apic_icr_read(void)
{
- apic->write(reg, val);
+ return static_call(apic_call_icr_read)();
}
-static inline void apic_eoi(void)
+static __always_inline void apic_icr_write(u32 low, u32 high)
{
- apic->eoi_write(APIC_EOI, APIC_EOI_ACK);
+ static_call(apic_call_icr_write)(low, high);
}
-static inline u64 apic_icr_read(void)
+static __always_inline void __apic_send_IPI(int cpu, int vector)
{
- return apic->icr_read();
+ static_call(apic_call_send_IPI)(cpu, vector);
}
-static inline void apic_icr_write(u32 low, u32 high)
+static __always_inline void __apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
- apic->icr_write(low, high);
+ static_call_mod(apic_call_send_IPI_mask)(mask, vector);
}
-static inline void apic_wait_icr_idle(void)
+static __always_inline void __apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
- apic->wait_icr_idle();
+ static_call(apic_call_send_IPI_mask_allbutself)(mask, vector);
}
-static inline u32 safe_apic_wait_icr_idle(void)
+static __always_inline void __apic_send_IPI_allbutself(int vector)
{
- return apic->safe_wait_icr_idle();
+ static_call(apic_call_send_IPI_allbutself)(vector);
}
-extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
+static __always_inline void __apic_send_IPI_all(int vector)
+{
+ static_call(apic_call_send_IPI_all)(vector);
+}
+
+static __always_inline void __apic_send_IPI_self(int vector)
+{
+ static_call_mod(apic_call_send_IPI_self)(vector);
+}
+
+static __always_inline void apic_wait_icr_idle(void)
+{
+ static_call_cond(apic_call_wait_icr_idle)();
+}
+
+static __always_inline u32 safe_apic_wait_icr_idle(void)
+{
+ return apic->safe_wait_icr_idle ? apic->safe_wait_icr_idle() : 0;
+}
+
+static __always_inline bool apic_id_valid(u32 apic_id)
+{
+ return apic_id <= apic->max_apic_id;
+}
#else /* CONFIG_X86_LOCAL_APIC */
@@ -430,22 +476,16 @@ static inline u64 apic_icr_read(void) { return 0; }
static inline void apic_icr_write(u32 low, u32 high) { }
static inline void apic_wait_icr_idle(void) { }
static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
-static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
+static inline void apic_set_eoi_cb(void (*eoi)(void)) {}
+static inline void apic_native_eoi(void) { WARN_ON_ONCE(1); }
+static inline void apic_setup_apic_calls(void) { }
+
+#define apic_update_callback(_callback, _fn) do { } while (0)
#endif /* CONFIG_X86_LOCAL_APIC */
extern void apic_ack_irq(struct irq_data *data);
-static inline void ack_APIC_irq(void)
-{
- /*
- * ack_APIC_irq() actually gets compiled as a single instruction
- * ... yummie.
- */
- apic_eoi();
-}
-
-
static inline bool lapic_vector_set_in_irr(unsigned int vector)
{
u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -475,10 +515,6 @@ extern void generic_bigsmp_probe(void);
#include <asm/smp.h>
-#define APIC_DFR_VALUE (APIC_DFR_FLAT)
-
-DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
-
extern struct apic apic_noop;
static inline unsigned int read_apic_id(void)
@@ -490,12 +526,14 @@ static inline unsigned int read_apic_id(void)
#ifdef CONFIG_X86_64
typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip);
-extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler);
+extern int default_acpi_madt_oem_check(char *, char *);
+extern void x86_64_probe_apic(void);
+#else
+static inline int default_acpi_madt_oem_check(char *a, char *b) { return 0; }
+static inline void x86_64_probe_apic(void) { }
#endif
extern int default_apic_id_valid(u32 apicid);
-extern int default_acpi_madt_oem_check(char *, char *);
-extern void default_setup_apic_routing(void);
extern u32 apic_default_calc_apicid(unsigned int cpu);
extern u32 apic_flat_calc_apicid(unsigned int cpu);
@@ -503,9 +541,12 @@ extern u32 apic_flat_calc_apicid(unsigned int cpu);
extern bool default_check_apicid_used(physid_mask_t *map, int apicid);
extern void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap);
extern int default_cpu_present_to_apicid(int mps_cpu);
-extern int default_check_phys_apicid_present(int phys_apicid);
-#endif /* CONFIG_X86_LOCAL_APIC */
+#else /* CONFIG_X86_LOCAL_APIC */
+
+static inline unsigned int read_apic_id(void) { return 0; }
+
+#endif /* !CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_SMP
void apic_smt_update(void);
diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h
index 36aec57ea7a3..fa918f01333e 100644
--- a/arch/x86/include/asm/audit.h
+++ b/arch/x86/include/asm/audit.h
@@ -4,4 +4,11 @@
int ia32_classify_syscall(unsigned int syscall);
+extern unsigned ia32_dir_class[];
+extern unsigned ia32_write_class[];
+extern unsigned ia32_read_class[];
+extern unsigned ia32_chattr_class[];
+extern unsigned ia32_signal_class[];
+
+
#endif /* _ASM_X86_AUDIT_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b69b0d7756aa..2061ed1c398f 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -198,7 +198,6 @@
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
-#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */
@@ -308,6 +307,7 @@
#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
+#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
@@ -384,6 +384,7 @@
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */
#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index fafe9be7a6f4..702d93fdd10e 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -105,6 +105,18 @@
# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31))
#endif
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+#define DISABLE_USER_SHSTK 0
+#else
+#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31))
+#endif
+
+#ifdef CONFIG_X86_KERNEL_IBT
+#define DISABLE_IBT 0
+#else
+#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -120,7 +132,7 @@
#define DISABLED_MASK9 (DISABLE_SGX)
#define DISABLED_MASK10 0
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
- DISABLE_CALL_DEPTH_TRACKING)
+ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
#define DISABLED_MASK12 (DISABLE_LAM)
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
@@ -128,7 +140,7 @@
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
-#define DISABLED_MASK18 0
+#define DISABLED_MASK18 (DISABLE_IBT)
#define DISABLED_MASK19 0
#define DISABLED_MASK20 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index b475d9a582b8..31089b851c4f 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -82,6 +82,15 @@ static inline void fpregs_unlock(void)
preempt_enable();
}
+/*
+ * FPU state gets lazily restored before returning to userspace. So when in the
+ * kernel, the valid FPU state may be kept in the buffer. This function will force
+ * restore all the fpu state to the registers early if needed, and lock them from
+ * being automatically saved/restored. Then FPU state can be modified safely in the
+ * registers, before unlocking with fpregs_unlock().
+ */
+void fpregs_lock_and_load(void);
+
#ifdef CONFIG_X86_DEBUG_FPU
extern void fpregs_assert_state_consistent(void);
#else
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h
index 4f928d6a367b..697b77e96025 100644
--- a/arch/x86/include/asm/fpu/regset.h
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -7,11 +7,12 @@
#include <linux/regset.h>
-extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active;
+extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active,
+ ssp_active;
extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get,
- xstateregs_get;
+ xstateregs_get, ssp_get;
extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
- xstateregs_set;
+ xstateregs_set, ssp_set;
/*
* xstateregs_active == regset_fpregs_active. Please refer to the comment
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
index 78fcde7b1f07..ca6e5e5f16b2 100644
--- a/arch/x86/include/asm/fpu/sched.h
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -11,7 +11,8 @@
extern void save_fpregs_to_fpstate(struct fpu *fpu);
extern void fpu__drop(struct fpu *fpu);
-extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal);
+extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
+ unsigned long shstk_addr);
extern void fpu_flush_thread(void);
/*
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 7f6d858ff47a..eb810074f1e7 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -115,8 +115,8 @@ enum xfeature {
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
XFEATURE_PASID,
- XFEATURE_RSRVD_COMP_11,
- XFEATURE_RSRVD_COMP_12,
+ XFEATURE_CET_USER,
+ XFEATURE_CET_KERNEL_UNUSED,
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
@@ -138,6 +138,8 @@ enum xfeature {
#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
+#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
+#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED)
#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
@@ -253,6 +255,16 @@ struct pkru_state {
} __packed;
/*
+ * State component 11 is Control-flow Enforcement user states
+ */
+struct cet_user_state {
+ /* user control-flow settings */
+ u64 user_cet;
+ /* user shadow stack pointer */
+ u64 user_ssp;
+};
+
+/*
* State component 15: Architectural LBR configuration state.
* The size of Arch LBR state depends on the number of LBRs (lbr_depth).
*/
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index cd3dd170e23a..d4427b88ee12 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -50,7 +50,8 @@
#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
/* All currently supported supervisor features */
-#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
+#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_CET_USER)
/*
* A supervisor state component may not always contain valuable information,
@@ -77,7 +78,8 @@
* Unsupported supervisor features. When a supervisor feature in this mask is
* supported in the future, move it to the supported supervisor feature mask.
*/
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
+ XFEATURE_MASK_CET_KERNEL)
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index d465ece58151..551829884734 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -97,10 +97,10 @@ extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
extern void lock_vector_lock(void);
extern void unlock_vector_lock(void);
#ifdef CONFIG_SMP
-extern void send_cleanup_vector(struct irq_cfg *);
+extern void vector_schedule_cleanup(struct irq_cfg *);
extern void irq_complete_move(struct irq_cfg *cfg);
#else
-static inline void send_cleanup_vector(struct irq_cfg *c) { }
+static inline void vector_schedule_cleanup(struct irq_cfg *c) { }
static inline void irq_complete_move(struct irq_cfg *c) { }
#endif
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index cea95dcd27c2..2ff26f53cd62 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -169,7 +169,8 @@
enum hv_isolation_type {
HV_ISOLATION_TYPE_NONE = 0,
HV_ISOLATION_TYPE_VBS = 1,
- HV_ISOLATION_TYPE_SNP = 2
+ HV_ISOLATION_TYPE_SNP = 2,
+ HV_ISOLATION_TYPE_TDX = 3
};
/* Hyper-V specific model specific registers (MSRs) */
@@ -301,6 +302,13 @@ enum hv_isolation_type {
#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT
#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC
+/*
+ * Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and
+ * there is not associated MSR address.
+ */
+#define HV_X64_REGISTER_VSM_VP_STATUS 0x000D0003
+#define HV_X64_VTL_MASK GENMASK(3, 0)
+
/* Hyper-V memory host visibility */
enum hv_mem_host_visibility {
VMBUS_PAGE_NOT_VISIBLE = 0,
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index b241af4ce9b4..05fd175cec7d 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -614,7 +614,7 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
#endif
/* #CP */
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
#endif
@@ -648,7 +648,6 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR, sysvec_x86_platform_ipi);
#ifdef CONFIG_SMP
DECLARE_IDTENTRY(RESCHEDULE_VECTOR, sysvec_reschedule_ipi);
-DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup);
DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot);
DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single);
DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function);
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index e9025640f634..76238842406a 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -35,9 +35,6 @@
* - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*/
-#define ARCH_HAS_IOREMAP_WC
-#define ARCH_HAS_IOREMAP_WT
-
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/cc_platform.h>
@@ -212,8 +209,6 @@ void memset_io(volatile void __iomem *, int, size_t);
#define memcpy_toio memcpy_toio
#define memset_io memset_io
-#include <asm-generic/iomap.h>
-
/*
* ISA space is 'always mapped' on a typical x86 system, no need to
* explicitly ioremap() it. The fact that the ISA IO space is mapped
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 437aa8d00e53..51c782600e02 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -109,8 +109,8 @@ extern int mp_irq_entries;
/* MP IRQ source entries */
extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
-/* 1 if "noapic" boot option passed */
-extern int skip_ioapic_setup;
+/* True if "noapic" boot option passed */
+extern bool ioapic_is_disabled;
/* 1 if "noapic" boot option passed */
extern int noioapicquirk;
@@ -129,7 +129,7 @@ extern unsigned long io_apic_irqs;
* assignment of PCI IRQ's.
*/
#define io_apic_assign_pci_irqs \
- (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
+ (mp_irq_entries && !ioapic_is_disabled && io_apic_irqs)
struct irq_cfg;
extern void ioapic_insert_resources(void);
@@ -179,6 +179,7 @@ extern void print_IO_APICs(void);
#define IO_APIC_IRQ(x) 0
#define io_apic_assign_pci_irqs 0
#define setup_ioapic_ids_from_mpc x86_init_noop
+#define nr_ioapics (0)
static inline void ioapic_insert_resources(void) { }
static inline int arch_early_ioapic_init(void) { return 0; }
static inline void print_IO_APICs(void) {}
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 29e083b92813..836c170d3087 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -42,7 +42,7 @@ extern void init_ISA_irqs(void);
#ifdef CONFIG_X86_LOCAL_APIC
void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
- bool exclude_self);
+ int exclude_cpu);
#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
#endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 43dcb9284208..3a19904c2db6 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -35,13 +35,6 @@
*/
#define FIRST_EXTERNAL_VECTOR 0x20
-/*
- * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
- * triggering cleanup after irq migration. 0x21-0x2f will still be used
- * for device interrupts.
- */
-#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
-
#define IA32_SYSCALL_VECTOR 0x80
/*
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 5b77bbc28f96..3be6a98751f0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -209,6 +209,24 @@ typedef void crash_vmclear_fn(void);
extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
extern void kdump_nmi_shootdown_cpus(void);
+#ifdef CONFIG_CRASH_HOTPLUG
+void arch_crash_handle_hotplug_event(struct kimage *image);
+#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
+
+#ifdef CONFIG_HOTPLUG_CPU
+int arch_crash_hotplug_cpu_support(void);
+#define crash_hotplug_cpu_support arch_crash_hotplug_cpu_support
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_crash_hotplug_memory_support(void);
+#define crash_hotplug_memory_support arch_crash_hotplug_memory_support
+#endif
+
+unsigned int arch_crash_get_elfcorehdr_size(void);
+#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 1d29dc791f5a..416901d406f8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -186,6 +186,8 @@ do { \
#else
#define deactivate_mm(tsk, mm) \
do { \
+ if (!tsk->vfork_done) \
+ shstk_free(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index e90ac7e9ae2c..f46df8349e86 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -23,8 +23,6 @@ extern int pic_mode;
#define MAX_IRQ_SOURCES 256
-extern unsigned int def_to_bigsmp;
-
#else /* CONFIG_X86_64: */
#define MAX_MP_BUSSES 256
@@ -41,7 +39,6 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
extern unsigned int boot_cpu_physical_apicid;
extern u8 boot_cpu_apic_version;
-extern unsigned long mp_lapic_addr;
#ifdef CONFIG_X86_LOCAL_APIC
extern int smp_found_config;
@@ -76,7 +73,7 @@ static inline void e820__memblock_alloc_reserved_mpc_new(void) { }
#define default_get_smp_config x86_init_uint_noop
#endif
-int generic_processor_info(int apicid, int version);
+int generic_processor_info(int apicid);
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
@@ -87,13 +84,7 @@ struct physid_mask {
typedef struct physid_mask physid_mask_t;
#define physid_set(physid, map) set_bit(physid, (map).mask)
-#define physid_clear(physid, map) clear_bit(physid, (map).mask)
#define physid_isset(physid, map) test_bit(physid, (map).mask)
-#define physid_test_and_set(physid, map) \
- test_and_set_bit(physid, (map).mask)
-
-#define physids_and(dst, src1, src2) \
- bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
#define physids_or(dst, src1, src2) \
bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
@@ -101,29 +92,9 @@ typedef struct physid_mask physid_mask_t;
#define physids_clear(map) \
bitmap_zero((map).mask, MAX_LOCAL_APIC)
-#define physids_complement(dst, src) \
- bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
-
#define physids_empty(map) \
bitmap_empty((map).mask, MAX_LOCAL_APIC)
-#define physids_equal(map1, map2) \
- bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
-
-#define physids_weight(map) \
- bitmap_weight((map).mask, MAX_LOCAL_APIC)
-
-#define physids_shift_right(d, s, n) \
- bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
-
-#define physids_shift_left(d, s, n) \
- bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
-
-static inline unsigned long physids_coerce(physid_mask_t *map)
-{
- return map->mask[0];
-}
-
static inline void physids_promote(unsigned long physids, physid_mask_t *map)
{
physids_clear(*map);
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index fa83d88e4c99..033b53f993c6 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -26,6 +26,7 @@
union hv_ghcb;
DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
+DECLARE_STATIC_KEY_FALSE(isolation_type_tdx);
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
@@ -40,6 +41,7 @@ static inline unsigned char hv_get_nmi_reason(void)
#if IS_ENABLED(CONFIG_HYPERV)
extern int hyperv_init_cpuhp;
+extern bool hyperv_paravisor_present;
extern void *hv_hypercall_pg;
@@ -47,10 +49,25 @@ extern u64 hv_current_partition_id;
extern union hv_ghcb * __percpu *hv_ghcb_pg;
+bool hv_isolation_type_snp(void);
+bool hv_isolation_type_tdx(void);
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
+
+/*
+ * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA
+ * to start AP in enlightened SEV guest.
+ */
+#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define HV_AP_SEGMENT_LIMIT 0xffffffff
+
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
+/*
+ * If the hypercall involves no input or output parameters, the hypervisor
+ * ignores the corresponding GPA pointer.
+ */
static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
{
u64 input_address = input ? virt_to_phys(input) : 0;
@@ -58,6 +75,19 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
u64 hv_status;
#ifdef CONFIG_X86_64
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input_address, output_address);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input_address)
+ : "r" (output_address)
+ : "cc", "memory", "r8", "r9", "r10", "r11");
+ return hv_status;
+ }
+
if (!hv_hypercall_pg)
return U64_MAX;
@@ -101,7 +131,16 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
u64 hv_status;
#ifdef CONFIG_X86_64
- {
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input1, 0);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__(
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ :: "cc", "r8", "r9", "r10", "r11");
+ } else {
__asm__ __volatile__(CALL_NOSPEC
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input1)
@@ -146,7 +185,17 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
u64 hv_status;
#ifdef CONFIG_X86_64
- {
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input1, input2);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : "r" (input2)
+ : "cc", "r8", "r9", "r10", "r11");
+ } else {
__asm__ __volatile__("mov %4, %%r8\n"
CALL_NOSPEC
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
@@ -225,20 +274,24 @@ int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#ifdef CONFIG_AMD_MEM_ENCRYPT
-void hv_ghcb_msr_write(u64 msr, u64 value);
-void hv_ghcb_msr_read(u64 msr, u64 *value);
bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
-void hv_vtom_init(void);
+int hv_snp_boot_ap(int cpu, unsigned long start_ip);
#else
-static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
-static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
-static inline void hv_vtom_init(void) {}
+static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; }
#endif
-extern bool hv_isolation_type_snp(void);
+#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
+void hv_vtom_init(void);
+void hv_ivm_msr_write(u64 msr, u64 value);
+void hv_ivm_msr_read(u64 msr, u64 *value);
+#else
+static inline void hv_vtom_init(void) {}
+static inline void hv_ivm_msr_write(u64 msr, u64 value) {}
+static inline void hv_ivm_msr_read(u64 msr, u64 *value) {}
+#endif
static inline bool hv_is_synic_reg(unsigned int reg)
{
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5700bb337987..d6ad98ca1288 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -125,9 +125,15 @@ extern pmdval_t early_pmd_flags;
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
{
- return pte_flags(pte) & _PAGE_DIRTY;
+ return pte_flags(pte) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pte_shstk(pte_t pte)
+{
+ return cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}
static inline int pte_young(pte_t pte)
@@ -135,9 +141,16 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pmd_shstk(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_DIRTY;
+ return cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
+ (_PAGE_DIRTY | _PAGE_PSE);
}
#define pmd_young pmd_young
@@ -146,9 +159,9 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
}
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
{
- return pud_flags(pud) & _PAGE_DIRTY;
+ return pud_flags(pud) & _PAGE_DIRTY_BITS;
}
static inline int pud_young(pud_t pud)
@@ -158,7 +171,27 @@ static inline int pud_young(pud_t pud)
static inline int pte_write(pte_t pte)
{
- return pte_flags(pte) & _PAGE_RW;
+ /*
+ * Shadow stack pages are logically writable, but do not have
+ * _PAGE_RW. Check for them separately from _PAGE_RW itself.
+ */
+ return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte);
+}
+
+#define pmd_write pmd_write
+static inline int pmd_write(pmd_t pmd)
+{
+ /*
+ * Shadow stack pages are logically writable, but do not have
+ * _PAGE_RW. Check for them separately from _PAGE_RW itself.
+ */
+ return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd);
+}
+
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_RW;
}
static inline int pte_huge(pte_t pte)
@@ -185,6 +218,8 @@ static inline int pte_special(pte_t pte)
static inline u64 protnone_mask(u64 val);
+#define PFN_PTE_SHIFT PAGE_SHIFT
+
static inline unsigned long pte_pfn(pte_t pte)
{
phys_addr_t pfn = pte_val(pte);
@@ -290,9 +325,63 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+/*
+ * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the
+ * case of X86_FEATURE_USER_SHSTK, these PTEs denote shadow stack memory. So
+ * when creating dirty, write-protected memory, a software bit is used:
+ * _PAGE_BIT_SAVED_DIRTY. The following functions take a PTE and transition the
+ * Dirty bit to SavedDirty, and vice-vesra.
+ *
+ * This shifting is only done if needed. In the case of shifting
+ * Dirty->SavedDirty, the condition is if the PTE is Write=0. In the case of
+ * shifting SavedDirty->Dirty, the condition is Write=1.
+ */
+static inline pgprotval_t mksaveddirty_shift(pgprotval_t v)
+{
+ pgprotval_t cond = (~v >> _PAGE_BIT_RW) & 1;
+
+ v |= ((v >> _PAGE_BIT_DIRTY) & cond) << _PAGE_BIT_SAVED_DIRTY;
+ v &= ~(cond << _PAGE_BIT_DIRTY);
+
+ return v;
+}
+
+static inline pgprotval_t clear_saveddirty_shift(pgprotval_t v)
+{
+ pgprotval_t cond = (v >> _PAGE_BIT_RW) & 1;
+
+ v |= ((v >> _PAGE_BIT_SAVED_DIRTY) & cond) << _PAGE_BIT_DIRTY;
+ v &= ~(cond << _PAGE_BIT_SAVED_DIRTY);
+
+ return v;
+}
+
+static inline pte_t pte_mksaveddirty(pte_t pte)
+{
+ pteval_t v = native_pte_val(pte);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pte(v);
+}
+
+static inline pte_t pte_clear_saveddirty(pte_t pte)
+{
+ pteval_t v = native_pte_val(pte);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pte(v);
+}
+
static inline pte_t pte_wrprotect(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_RW);
+ pte = pte_clear_flags(pte, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PTE (Write=0,Dirty=1). Move the hardware
+ * dirty value to the software bit, if present.
+ */
+ return pte_mksaveddirty(pte);
}
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
@@ -330,7 +419,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte)
static inline pte_t pte_mkclean(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
}
static inline pte_t pte_mkold(pte_t pte)
@@ -345,7 +434,16 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pte = pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pte_mksaveddirty(pte);
+}
+
+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+ pte = pte_clear_flags(pte, _PAGE_RW);
+
+ return pte_set_flags(pte, _PAGE_DIRTY);
}
static inline pte_t pte_mkyoung(pte_t pte)
@@ -353,11 +451,15 @@ static inline pte_t pte_mkyoung(pte_t pte)
return pte_set_flags(pte, _PAGE_ACCESSED);
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_novma(pte_t pte)
{
return pte_set_flags(pte, _PAGE_RW);
}
+struct vm_area_struct;
+pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma);
+#define pte_mkwrite pte_mkwrite
+
static inline pte_t pte_mkhuge(pte_t pte)
{
return pte_set_flags(pte, _PAGE_PSE);
@@ -402,9 +504,34 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+/* See comments above mksaveddirty_shift() */
+static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pmd(v);
+}
+
+/* See comments above mksaveddirty_shift() */
+static inline pmd_t pmd_clear_saveddirty(pmd_t pmd)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pmd(v);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_RW);
+ pmd = pmd_clear_flags(pmd, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PMD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ return pmd_mksaveddirty(pmd);
}
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
@@ -431,12 +558,21 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pmd = pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pmd_mksaveddirty(pmd);
+}
+
+static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
+{
+ pmd = pmd_clear_flags(pmd, _PAGE_RW);
+
+ return pmd_set_flags(pmd, _PAGE_DIRTY);
}
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
@@ -454,11 +590,14 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_ACCESSED);
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkwrite_novma(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_RW);
}
+pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+#define pmd_mkwrite pmd_mkwrite
+
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
pudval_t v = native_pud_val(pud);
@@ -473,6 +612,24 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
return native_make_pud(v & ~clear);
}
+/* See comments above mksaveddirty_shift() */
+static inline pud_t pud_mksaveddirty(pud_t pud)
+{
+ pudval_t v = native_pud_val(pud);
+
+ v = mksaveddirty_shift(v);
+ return native_make_pud(v);
+}
+
+/* See comments above mksaveddirty_shift() */
+static inline pud_t pud_clear_saveddirty(pud_t pud)
+{
+ pudval_t v = native_pud_val(pud);
+
+ v = clear_saveddirty_shift(v);
+ return native_make_pud(v);
+}
+
static inline pud_t pud_mkold(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_ACCESSED);
@@ -480,17 +637,26 @@ static inline pud_t pud_mkold(pud_t pud)
static inline pud_t pud_mkclean(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
}
static inline pud_t pud_wrprotect(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_RW);
+ pud = pud_clear_flags(pud, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PUD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ return pud_mksaveddirty(pud);
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pud = pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+
+ return pud_mksaveddirty(pud);
}
static inline pud_t pud_mkdevmap(pud_t pud)
@@ -510,7 +676,9 @@ static inline pud_t pud_mkyoung(pud_t pud)
static inline pud_t pud_mkwrite(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_RW);
+ pud = pud_set_flags(pud, _PAGE_RW);
+
+ return pud_clear_saveddirty(pud);
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -627,6 +795,7 @@ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pteval_t val = pte_val(pte), oldval = val;
+ pte_t pte_result;
/*
* Chop off the NX bit (if present), and add the NX portion of
@@ -635,17 +804,54 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
- return __pte(val);
+
+ pte_result = __pte(val);
+
+ /*
+ * To avoid creating Write=0,Dirty=1 PTEs, pte_modify() needs to avoid:
+ * 1. Marking Write=0 PTEs Dirty=1
+ * 2. Marking Dirty=1 PTEs Write=0
+ *
+ * The first case cannot happen because the _PAGE_CHG_MASK will filter
+ * out any Dirty bit passed in newprot. Handle the second case by
+ * going through the mksaveddirty exercise. Only do this if the old
+ * value was Write=1 to avoid doing this on Shadow Stack PTEs.
+ */
+ if (oldval & _PAGE_RW)
+ pte_result = pte_mksaveddirty(pte_result);
+ else
+ pte_result = pte_clear_saveddirty(pte_result);
+
+ return pte_result;
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
pmdval_t val = pmd_val(pmd), oldval = val;
+ pmd_t pmd_result;
- val &= _HPAGE_CHG_MASK;
+ val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY);
val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
- return __pmd(val);
+
+ pmd_result = __pmd(val);
+
+ /*
+ * To avoid creating Write=0,Dirty=1 PMDs, pte_modify() needs to avoid:
+ * 1. Marking Write=0 PMDs Dirty=1
+ * 2. Marking Dirty=1 PMDs Write=0
+ *
+ * The first case cannot happen because the _PAGE_CHG_MASK will filter
+ * out any Dirty bit passed in newprot. Handle the second case by
+ * going through the mksaveddirty exercise. Only do this if the old
+ * value was Write=1 to avoid doing this on Shadow Stack PTEs.
+ */
+ if (oldval & _PAGE_RW)
+ pmd_result = pmd_mksaveddirty(pmd_result);
+ else
+ pmd_result = pmd_clear_saveddirty(pmd_result);
+
+ return pmd_result;
}
/*
@@ -829,7 +1035,14 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
* (Currently stuck as a macro because of indirect forward reference
* to linux/mm.h:page_to_nid())
*/
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+#define mk_pte(page, pgprot) \
+({ \
+ pgprot_t __pgprot = pgprot; \
+ \
+ WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
+ _PAGE_DIRTY); \
+ pfn_pte(page_to_pfn(page), __pgprot); \
+})
static inline int pmd_bad(pmd_t pmd)
{
@@ -1020,24 +1233,17 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
return res;
}
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- page_table_check_pte_set(mm, addr, ptep, pte);
- set_pte(ptep, pte);
-}
-
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- page_table_check_pmd_set(mm, addr, pmdp, pmd);
+ page_table_check_pmd_set(mm, pmdp, pmd);
set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
- page_table_check_pud_set(mm, addr, pudp, pud);
+ page_table_check_pud_set(mm, pudp, pud);
native_set_pud(pudp, pud);
}
@@ -1068,7 +1274,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
- page_table_check_pte_clear(mm, addr, pte);
+ page_table_check_pte_clear(mm, pte);
return pte;
}
@@ -1084,7 +1290,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
* care about updates and native needs no locking
*/
pte = native_local_ptep_get_and_clear(ptep);
- page_table_check_pte_clear(mm, addr, pte);
+ page_table_check_pte_clear(mm, pte);
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
@@ -1095,7 +1301,17 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
+ /*
+ * Avoid accidentally creating shadow stack PTEs
+ * (Write=0,Dirty=1). Use cmpxchg() to prevent races with
+ * the hardware setting Dirty=1.
+ */
+ pte_t old_pte, new_pte;
+
+ old_pte = READ_ONCE(*ptep);
+ do {
+ new_pte = pte_wrprotect(old_pte);
+ } while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
}
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
@@ -1121,19 +1337,13 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define pmd_write pmd_write
-static inline int pmd_write(pmd_t pmd)
-{
- return pmd_flags(pmd) & _PAGE_RW;
-}
-
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
pmd_t pmd = native_pmdp_get_and_clear(pmdp);
- page_table_check_pmd_clear(mm, addr, pmd);
+ page_table_check_pmd_clear(mm, pmd);
return pmd;
}
@@ -1144,7 +1354,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
{
pud_t pud = native_pudp_get_and_clear(pudp);
- page_table_check_pud_clear(mm, addr, pud);
+ page_table_check_pud_clear(mm, pud);
return pud;
}
@@ -1153,13 +1363,17 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
-}
+ /*
+ * Avoid accidentally creating shadow stack PTEs
+ * (Write=0,Dirty=1). Use cmpxchg() to prevent races with
+ * the hardware setting Dirty=1.
+ */
+ pmd_t old_pmd, new_pmd;
-#define pud_write pud_write
-static inline int pud_write(pud_t pud)
-{
- return pud_flags(pud) & _PAGE_RW;
+ old_pmd = READ_ONCE(*pmdp);
+ do {
+ new_pmd = pmd_wrprotect(old_pmd);
+ } while (!try_cmpxchg((long *)pmdp, (long *)&old_pmd, *(long *)&new_pmd));
}
#ifndef pmdp_establish
@@ -1167,7 +1381,7 @@ static inline int pud_write(pud_t pud)
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
- page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
+ page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
if (IS_ENABLED(CONFIG_SMP)) {
return xchg(pmdp, pmd);
} else {
@@ -1292,6 +1506,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
}
+static inline void update_mmu_cache_range(struct vm_fault *vmf,
+ struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+}
static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd)
{
@@ -1412,6 +1631,11 @@ static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+ /*
+ * Write=0,Dirty=1 PTEs are shadow stack, which the kernel
+ * shouldn't generally allow access to, but since they
+ * are already Write=0, the below logic covers both cases.
+ */
if (write)
need_pte_bits |= _PAGE_RW;
@@ -1453,6 +1677,12 @@ static inline bool arch_has_hw_pte_young(void)
return true;
}
+#define arch_check_zapped_pte arch_check_zapped_pte
+void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
+
+#define arch_check_zapped_pmd arch_check_zapped_pmd
+void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd);
+
#ifdef CONFIG_XEN_PV
#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
static inline bool arch_has_hw_nonleaf_pmd_young(void)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index ba3e2554799a..0b748ee16b3d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -21,7 +21,8 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_SOFTW4 57 /* available for programmer */
+#define _PAGE_BIT_SOFTW5 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
@@ -34,6 +35,13 @@
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+#ifdef CONFIG_X86_64
+#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit */
+#else
+/* Shared with _PAGE_BIT_UFFD_WP which is not supported on 32 bit */
+#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW2 /* Saved Dirty bit */
+#endif
+
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -117,6 +125,18 @@
#define _PAGE_SOFTW4 (_AT(pteval_t, 0))
#endif
+/*
+ * The hardware requires shadow stack to be Write=0,Dirty=1. However,
+ * there are valid cases where the kernel might create read-only PTEs that
+ * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In
+ * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit,
+ * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have
+ * (Write=0,SavedDirty=1,Dirty=0) set.
+ */
+#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY)
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY)
+
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
/*
@@ -125,11 +145,12 @@
* instance, and is *not* included in this mask since
* pte_modify() does modify it.
*/
-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \
- _PAGE_UFFD_WP)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+#define _COMMON_PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | \
+ _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY | \
+ _PAGE_DEVMAP | _PAGE_ENC | _PAGE_UFFD_WP)
+#define _PAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
+#define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)
/*
* The cache modes defined here are used to translate between pure SW usage
@@ -188,14 +209,22 @@ enum page_cache_mode {
#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
+
+/*
+ * Page tables needs to have Write=1 in order for any lower PTEs to be
+ * writable. This includes shadow stack memory (Write=0, Dirty=1)
+ */
#define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0)
#define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC)
#define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0)
#define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC)
-#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G)
-#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G)
+
+#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G)
+#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G)
+#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
#define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC)
-#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G)
#define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G)
#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G)
#define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 861e53e201e9..0086920cda06 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -28,6 +28,7 @@ struct vm86;
#include <asm/unwind_hints.h>
#include <asm/vmxfeatures.h>
#include <asm/vdso/processor.h>
+#include <asm/shstk.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -190,7 +191,6 @@ static inline unsigned long long l1tf_pfn_limit(void)
}
extern void early_cpu_init(void);
-extern void identify_boot_cpu(void);
extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
@@ -475,6 +475,13 @@ struct thread_struct {
*/
u32 pkru;
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ unsigned long features;
+ unsigned long features_locked;
+
+ struct thread_shstk shstk;
+#endif
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index 7fa611216417..4b081e0d3306 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -2,12 +2,7 @@
#ifndef _ASM_X86_RMWcc
#define _ASM_X86_RMWcc
-/* This counts to 12. Any more, it will return 13th argument. */
-#define __RMWcc_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
-#define RMWcc_ARGS(X...) __RMWcc_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
-
-#define __RMWcc_CONCAT(a, b) a ## b
-#define RMWcc_CONCAT(a, b) __RMWcc_CONCAT(a, b)
+#include <linux/args.h>
#define __CLOBBERS_MEM(clb...) "memory", ## clb
@@ -48,7 +43,7 @@ cc_label: c = true; \
#define GEN_UNARY_RMWcc_3(op, var, cc) \
GEN_UNARY_RMWcc_4(op, var, cc, "%[var]")
-#define GEN_UNARY_RMWcc(X...) RMWcc_CONCAT(GEN_UNARY_RMWcc_, RMWcc_ARGS(X))(X)
+#define GEN_UNARY_RMWcc(X...) CONCATENATE(GEN_UNARY_RMWcc_, COUNT_ARGS(X))(X)
#define GEN_BINARY_RMWcc_6(op, var, cc, vcon, _val, arg0) \
__GEN_RMWcc(op " %[val], " arg0, var, cc, \
@@ -57,7 +52,7 @@ cc_label: c = true; \
#define GEN_BINARY_RMWcc_5(op, var, cc, vcon, val) \
GEN_BINARY_RMWcc_6(op, var, cc, vcon, val, "%[var]")
-#define GEN_BINARY_RMWcc(X...) RMWcc_CONCAT(GEN_BINARY_RMWcc_, RMWcc_ARGS(X))(X)
+#define GEN_BINARY_RMWcc(X...) CONCATENATE(GEN_BINARY_RMWcc_, COUNT_ARGS(X))(X)
#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, cc, clobbers...) \
__GEN_RMWcc(op " %[var]\n\t" suffix, var, cc, \
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index a6e8373a5170..3fa87e5e11ab 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,8 +2,6 @@
#ifndef _ASM_X86_SECTIONS_H
#define _ASM_X86_SECTIONS_H
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
#include <asm-generic/sections.h>
#include <asm/extable.h>
@@ -18,20 +16,4 @@ extern char __end_of_kernel_reserve[];
extern unsigned long _brk_start, _brk_end;
-static inline bool arch_is_kernel_initmem_freed(unsigned long addr)
-{
- /*
- * If _brk_start has not been cleared, brk allocation is incomplete,
- * and we can not make assumptions about its use.
- */
- if (_brk_start)
- return 0;
-
- /*
- * After brk allocation is complete, space between _brk_end and _end
- * is available for allocation.
- */
- return addr >= _brk_end && addr < (unsigned long)&_end;
-}
-
#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
new file mode 100644
index 000000000000..42fee8959df7
--- /dev/null
+++ b/arch/x86/include/asm/shstk.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHSTK_H
+#define _ASM_X86_SHSTK_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+struct task_struct;
+struct ksignal;
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+struct thread_shstk {
+ u64 base;
+ u64 size;
+};
+
+long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
+void reset_thread_features(void);
+unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags,
+ unsigned long stack_size);
+void shstk_free(struct task_struct *p);
+int setup_signal_shadow_stack(struct ksignal *ksig);
+int restore_signal_shadow_stack(void);
+#else
+static inline long shstk_prctl(struct task_struct *task, int option,
+ unsigned long arg2) { return -EINVAL; }
+static inline void reset_thread_features(void) {}
+static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
+ unsigned long clone_flags,
+ unsigned long stack_size) { return 0; }
+static inline void shstk_free(struct task_struct *p) {}
+static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
+static inline int restore_signal_shadow_stack(void) { return 0; }
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_SHSTK_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 600cf25dbfc6..ad98dd1d9cfb 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -22,10 +22,6 @@ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
-DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
-DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
-#endif
struct task_struct;
@@ -132,11 +128,8 @@ void smp_kick_mwait_play_dead(void);
void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
void native_send_call_func_single_ipi(int cpu);
-void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle);
bool smp_park_other_cpus_in_init(void);
-
-void smp_store_boot_cpu_info(void);
void smp_store_cpu_info(int id);
asmlinkage __visible void smp_reboot_interrupt(void);
@@ -186,13 +179,6 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
extern unsigned disabled_cpus;
-#ifdef CONFIG_X86_LOCAL_APIC
-extern int hard_smp_processor_id(void);
-
-#else /* CONFIG_X86_LOCAL_APIC */
-#define hard_smp_processor_id() 0
-#endif /* CONFIG_X86_LOCAL_APIC */
-
#ifdef CONFIG_DEBUG_NMI_SELFTEST
extern void nmi_selftest(void);
#else
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index de48d1389936..d6cd9344f6c7 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -202,6 +202,19 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
}
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static inline int write_user_shstk_64(u64 __user *addr, u64 val)
+{
+ asm_volatile_goto("1: wrussq %[val], (%[addr])\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: [addr] "r" (addr), [val] "r" (val)
+ :: fail);
+ return 0;
+fail:
+ return -EFAULT;
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
#define nop() asm volatile ("nop")
static inline void serialize(void)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 80450e1d5385..25726893c6f4 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -3,6 +3,7 @@
#define _ASM_X86_TLBFLUSH_H
#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <asm/processor.h>
@@ -253,6 +254,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
}
+static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
+{
+ bool should_defer = false;
+
+ /* If remote CPUs need to be flushed then defer batch the flush */
+ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+ should_defer = true;
+ put_cpu();
+
+ return should_defer;
+}
+
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
/*
@@ -264,11 +277,18 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
return atomic64_inc_return(&mm->context.tlb_gen);
}
-static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm)
+static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+ struct mm_struct *mm,
+ unsigned long uaddr)
{
inc_mm_tlb_gen(mm);
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
+static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ flush_tlb_mm(mm);
}
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
@@ -286,7 +306,8 @@ static inline bool pte_flags_need_flush(unsigned long oldflags,
const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
_PAGE_ACCESSED;
const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
- _PAGE_SOFTW3 | _PAGE_SOFTW4;
+ _PAGE_SOFTW3 | _PAGE_SOFTW4 |
+ _PAGE_SAVED_DIRTY;
const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
_PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
_PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index 10b1de500ab1..afa524325e55 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -11,6 +11,7 @@
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
+ * bit 6 == 1: shadow stack access fault
* bit 15 == 1: SGX MMU page-fault
*/
enum x86_pf_error_code {
@@ -20,6 +21,7 @@ enum x86_pf_error_code {
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
+ X86_PF_SHSTK = 1 << 6,
X86_PF_SGX = 1 << 15,
};
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 47ecfff2c83d..b1c9cea6ba88 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -18,7 +18,8 @@ void __init trap_init(void);
asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
-extern bool ibt_selftest(void);
+extern int ibt_selftest(void);
+extern int ibt_selftest_noendbr(void);
#ifdef CONFIG_X86_F00F_BUG
/* For handling the FOOF bug */
@@ -47,4 +48,16 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs,
struct stack_info *info);
#endif
+static inline void cond_local_irq_enable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+}
+
+static inline void cond_local_irq_disable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+}
+
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 81b826d3b753..f2c02e4469cc 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -116,7 +116,7 @@ copy_user_generic(void *to, const void *from, unsigned long len)
"2:\n"
_ASM_EXTABLE_UA(1b, 2b)
:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
- : : "memory", "rax", "r8", "r9", "r10", "r11");
+ : : "memory", "rax");
clac();
return len;
}
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index fa9ec20783fa..85e63d58c074 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -295,7 +295,10 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
/* VIRT <-> MACHINE conversion */
#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
-#define virt_to_pfn(v) (PFN_DOWN(__pa(v)))
+static inline unsigned long virt_to_pfn(const void *v)
+{
+ return PFN_DOWN(__pa(v));
+}
#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
index 77a2d19cc990..abde0f44df57 100644
--- a/arch/x86/include/asm/xen/swiotlb-xen.h
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -2,12 +2,6 @@
#ifndef _ASM_X86_SWIOTLB_XEN_H
#define _ASM_X86_SWIOTLB_XEN_H
-#ifdef CONFIG_SWIOTLB_XEN
-extern int pci_xen_swiotlb_init_late(void);
-#else
-static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; }
-#endif
-
int xen_swiotlb_fixup(void *buf, unsigned long nslabs);
int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
unsigned int address_bits,
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 775dbd3aff73..8148bdddbd2c 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -3,6 +3,7 @@
#define _ASM_X86_MMAN_H
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
+#define MAP_ABOVE4G 0x80 /* only map above 4GB */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define arch_calc_vm_prot_bits(prot, key) ( \
@@ -12,6 +13,9 @@
((key) & 0x8 ? VM_PKEY_BIT3 : 0))
#endif
+/* Flags for map_shadow_stack(2) */
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
+
#include <asm-generic/mman.h>
#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index e8d7ebbca1a4..384e2cc6ac19 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -23,9 +23,21 @@
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+/* Don't use 0x3001-0x3004 because of old glibcs */
+
#define ARCH_GET_UNTAG_MASK 0x4001
#define ARCH_ENABLE_TAGGED_ADDR 0x4002
#define ARCH_GET_MAX_TAG_BITS 0x4003
#define ARCH_FORCE_TAGGED_SVA 0x4004
+#define ARCH_SHSTK_ENABLE 0x5001
+#define ARCH_SHSTK_DISABLE 0x5002
+#define ARCH_SHSTK_LOCK 0x5003
+#define ARCH_SHSTK_UNLOCK 0x5004
+#define ARCH_SHSTK_STATUS 0x5005
+
+/* ARCH_SHSTK_ features bits */
+#define ARCH_SHSTK_SHSTK (1ULL << 0)
+#define ARCH_SHSTK_WRSS (1ULL << 1)
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4070a01c11b7..3269a0e23d3a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -33,11 +33,10 @@ KCSAN_SANITIZE := n
KMSAN_SANITIZE_head$(BITS).o := n
KMSAN_SANITIZE_nmi.o := n
-# If instrumentation of this dir is enabled, boot hangs during first second.
-# Probably could be more selective here, but note that files related to irqs,
-# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
-# non-deterministic coverage.
-KCOV_INSTRUMENT := n
+# If instrumentation of the following files is enabled, boot hangs during
+# first second.
+KCOV_INSTRUMENT_head$(BITS).o := n
+KCOV_INSTRUMENT_sev.o := n
CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
@@ -49,6 +48,7 @@ obj-y += process_$(BITS).o signal.o signal_$(BITS).o
obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
+obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o
obj-y += setup.o x86_init.o i8259.o irqinit.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
@@ -145,6 +145,10 @@ obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_CALL_THUNKS) += callthunks.o
+obj-$(CONFIG_X86_CET) += cet.o
+
+obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 53369c57751e..2a0ea38955df 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -170,7 +170,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
*/
static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
{
- unsigned int ver = 0;
int cpu;
if (id >= MAX_LOCAL_APIC) {
@@ -183,10 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
return -EINVAL;
}
- if (boot_cpu_physical_apicid != -1U)
- ver = boot_cpu_apic_version;
-
- cpu = generic_processor_info(id, ver);
+ cpu = generic_processor_info(id);
if (cpu >= 0)
early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
@@ -240,7 +236,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- if (!apic->apic_id_valid(apic_id)) {
+ if (!apic_id_valid(apic_id)) {
if (enabled)
pr_warn("x2apic entry ignored\n");
return 0;
@@ -1182,7 +1178,7 @@ static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
- acpi_wake_cpu_handler_update(acpi_wakeup_cpu);
+ apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
return 0;
}
@@ -1279,7 +1275,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
/*
* if "noapic" boot option, don't look for IO-APICs
*/
- if (skip_ioapic_setup) {
+ if (ioapic_is_disabled) {
pr_info("Skipping IOAPIC probe due to 'noapic' option.\n");
return -ENODEV;
}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index a6fcaf16cdbf..2ee867d796d9 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -7,7 +7,7 @@
# In particualr, smp_apic_timer_interrupt() is called in random places.
KCOV_INSTRUMENT := n
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o
obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index af49e24b46a4..760adac3d1a8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -63,6 +63,8 @@
#include <asm/irq_regs.h>
#include <asm/cpu.h>
+#include "local.h"
+
unsigned int num_processors;
unsigned disabled_cpus;
@@ -74,11 +76,6 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
u8 boot_cpu_apic_version __ro_after_init;
/*
- * The highest APIC ID seen during enumeration.
- */
-static unsigned int max_physical_apicid;
-
-/*
* Bitmask of physically existing CPUs:
*/
physid_mask_t phys_cpu_present_map;
@@ -104,26 +101,20 @@ static bool virt_ext_dest_id __ro_after_init;
/* For parallel bootup. */
unsigned long apic_mmio_base __ro_after_init;
+static inline bool apic_accessible(void)
+{
+ return x2apic_mode || apic_mmio_base;
+}
+
/*
* Map cpu index to physical APIC ID
*/
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
#ifdef CONFIG_X86_32
-
-/*
- * On x86_32, the mapping between cpu and logical apicid may vary
- * depending on apic in use. The following early percpu variable is
- * used for the mapping. This is where the behaviors of x86_64 and 32
- * actually diverge. Let's keep it ugly for now.
- */
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
-
/* Local APIC was disabled by the BIOS and enabled by the kernel */
static int enabled_via_apicbase __ro_after_init;
@@ -179,8 +170,8 @@ static __init int setup_apicpmtimer(char *s)
__setup("apicpmtimer", setup_apicpmtimer);
#endif
-unsigned long mp_lapic_addr __ro_after_init;
-int disable_apic __ro_after_init;
+static unsigned long mp_lapic_addr __ro_after_init;
+bool apic_is_disabled __ro_after_init;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
static int disable_apic_timer __initdata;
/* Local APIC timer works in C2 */
@@ -206,8 +197,6 @@ unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
-static unsigned long apic_phys __ro_after_init;
-
/*
* Get the LAPIC version
*/
@@ -247,31 +236,7 @@ static int modern_apic(void)
*/
static void __init apic_disable(void)
{
- pr_info("APIC: switched to apic NOOP\n");
- apic = &apic_noop;
-}
-
-void native_apic_wait_icr_idle(void)
-{
- while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
- cpu_relax();
-}
-
-u32 native_safe_apic_wait_icr_idle(void)
-{
- u32 send_status;
- int timeout;
-
- timeout = 0;
- do {
- send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
- if (!send_status)
- break;
- inc_irq_stat(icr_read_retry_count);
- udelay(100);
- } while (timeout++ < 1000);
-
- return send_status;
+ apic_install_driver(&apic_noop);
}
void native_apic_icr_write(u32 low, u32 id)
@@ -537,7 +502,7 @@ static int lapic_timer_set_oneshot(struct clock_event_device *evt)
static void lapic_timer_broadcast(const struct cpumask *mask)
{
#ifdef CONFIG_SMP
- apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+ __apic_send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
#endif
}
@@ -810,7 +775,7 @@ bool __init apic_needs_pit(void)
return true;
/* Is there an APIC at all or is it disabled? */
- if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic)
+ if (!boot_cpu_has(X86_FEATURE_APIC) || apic_is_disabled)
return true;
/*
@@ -1110,7 +1075,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- ack_APIC_irq();
+ apic_eoi();
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);
@@ -1134,8 +1099,7 @@ void clear_local_APIC(void)
int maxlvt;
u32 v;
- /* APIC hasn't been mapped yet */
- if (!x2apic_mode && !apic_phys)
+ if (!apic_accessible())
return;
maxlvt = lapic_get_maxlvt();
@@ -1225,8 +1189,7 @@ void apic_soft_disable(void)
*/
void disable_local_APIC(void)
{
- /* APIC hasn't been mapped yet */
- if (!x2apic_mode && !apic_phys)
+ if (!apic_accessible())
return;
apic_soft_disable();
@@ -1299,7 +1262,7 @@ enum apic_intr_mode_id apic_intr_mode __ro_after_init;
static int __init __apic_intr_mode_select(void)
{
/* Check kernel option */
- if (disable_apic) {
+ if (apic_is_disabled) {
pr_info("APIC disabled via kernel command line\n");
return APIC_PIC;
}
@@ -1308,7 +1271,7 @@ static int __init __apic_intr_mode_select(void)
#ifdef CONFIG_X86_64
/* On 64-bit, the APIC must be integrated, Check local APIC only */
if (!boot_cpu_has(X86_FEATURE_APIC)) {
- disable_apic = 1;
+ apic_is_disabled = true;
pr_info("APIC disabled by BIOS\n");
return APIC_PIC;
}
@@ -1317,16 +1280,15 @@ static int __init __apic_intr_mode_select(void)
/* Neither 82489DX nor integrated APIC ? */
if (!boot_cpu_has(X86_FEATURE_APIC) && !smp_found_config) {
- disable_apic = 1;
+ apic_is_disabled = true;
return APIC_PIC;
}
/* If the BIOS pretends there is an integrated APIC ? */
if (!boot_cpu_has(X86_FEATURE_APIC) &&
APIC_INTEGRATED(boot_cpu_apic_version)) {
- disable_apic = 1;
- pr_err(FW_BUG "Local APIC %d not detected, force emulation\n",
- boot_cpu_physical_apicid);
+ apic_is_disabled = true;
+ pr_err(FW_BUG "Local APIC not detected, force emulation\n");
return APIC_PIC;
}
#endif
@@ -1347,12 +1309,6 @@ static int __init __apic_intr_mode_select(void)
pr_info("APIC: SMP mode deactivated\n");
return APIC_SYMMETRIC_IO_NO_ROUTING;
}
-
- if (read_apic_id() != boot_cpu_physical_apicid) {
- panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
- read_apic_id(), boot_cpu_physical_apicid);
- /* Or can we switch back to PIC here? */
- }
#endif
return APIC_SYMMETRIC_IO;
@@ -1439,7 +1395,9 @@ void __init apic_intr_mode_init(void)
break;
}
- default_setup_apic_routing();
+ x86_64_probe_apic();
+
+ x86_32_install_bigsmp();
if (x86_platform.apic_post_init)
x86_platform.apic_post_init();
@@ -1521,7 +1479,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr)
* per set bit.
*/
for_each_set_bit(bit, isr->map, APIC_IR_BITS)
- ack_APIC_irq();
+ apic_eoi();
return true;
}
@@ -1533,7 +1491,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr)
* interrupt from previous kernel might still have ISR bit set.
*
* Most probably by now the CPU has serviced that pending interrupt and it
- * might not have done the ack_APIC_irq() because it thought, interrupt
+ * might not have done the apic_eoi() because it thought, interrupt
* came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear
* the ISR bit and cpu thinks it has already serviced the interrupt. Hence
* a vector might get locked. It was noticed for timer irq (vector
@@ -1567,7 +1525,7 @@ static void setup_local_APIC(void)
int cpu = smp_processor_id();
unsigned int value;
- if (disable_apic) {
+ if (apic_is_disabled) {
disable_ioapic_support();
return;
}
@@ -1589,36 +1547,18 @@ static void setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
- /*
- * Double-check whether this APIC is really registered.
- * This is meaningless in clustered apic mode, so we skip it.
- */
- BUG_ON(!apic->apic_id_registered());
+ /* Validate that the APIC is registered if required */
+ BUG_ON(apic->apic_id_registered && !apic->apic_id_registered());
/*
* Intel recommends to set DFR, LDR and TPR before enabling
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
+ * document number 292116).
+ *
+ * Except for APICs which operate in physical destination mode.
*/
- apic->init_apic_ldr();
-
-#ifdef CONFIG_X86_32
- if (apic->dest_mode_logical) {
- int logical_apicid, ldr_apicid;
-
- /*
- * APIC LDR is initialized. If logical_apicid mapping was
- * initialized during get_smp_config(), make sure it matches
- * the actual value.
- */
- logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
- ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
- if (logical_apicid != BAD_APICID)
- WARN_ON(logical_apicid != ldr_apicid);
- /* Always use the value from LDR. */
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
- }
-#endif
+ if (apic->init_apic_ldr)
+ apic->init_apic_ldr();
/*
* Set Task Priority to 'accept all except vectors 0-31'. An APIC
@@ -1691,7 +1631,7 @@ static void setup_local_APIC(void)
* TODO: set up through-local-APIC from through-I/O-APIC? --macro
*/
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!cpu && (pic_mode || !value || skip_ioapic_setup)) {
+ if (!cpu && (pic_mode || !value || ioapic_is_disabled)) {
value = APIC_DM_EXTINT;
apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
} else {
@@ -1748,6 +1688,25 @@ void apic_ap_setup(void)
end_local_APIC_setup();
}
+static __init void cpu_set_boot_apic(void);
+
+static __init void apic_read_boot_cpu_id(bool x2apic)
+{
+ /*
+ * This can be invoked from check_x2apic() before the APIC has been
+ * selected. But that code knows for sure that the BIOS enabled
+ * X2APIC.
+ */
+ if (x2apic) {
+ boot_cpu_physical_apicid = native_apic_msr_read(APIC_ID);
+ boot_cpu_apic_version = GET_APIC_VERSION(native_apic_msr_read(APIC_LVR));
+ } else {
+ boot_cpu_physical_apicid = read_apic_id();
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+ cpu_set_boot_apic();
+}
+
#ifdef CONFIG_X86_X2APIC
int x2apic_mode;
EXPORT_SYMBOL_GPL(x2apic_mode);
@@ -1847,6 +1806,8 @@ void x2apic_setup(void)
__x2apic_enable();
}
+static __init void apic_set_fixmap(void);
+
static __init void x2apic_disable(void)
{
u32 x2apic_id, state = x2apic_state;
@@ -1867,7 +1828,7 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
- register_lapic_address(mp_lapic_addr);
+ apic_set_fixmap();
}
static __init void x2apic_enable(void)
@@ -1928,6 +1889,7 @@ void __init check_x2apic(void)
x2apic_state = X2APIC_ON_LOCKED;
else
x2apic_state = X2APIC_ON;
+ apic_read_boot_cpu_id(true);
} else if (!boot_cpu_has(X86_FEATURE_X2APIC)) {
x2apic_state = X2APIC_DISABLED;
}
@@ -1943,7 +1905,7 @@ void __init check_x2apic(void)
pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n");
pr_err("Disabling APIC, expect reduced performance and functionality.\n");
- disable_apic = 1;
+ apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
}
@@ -1956,7 +1918,7 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
int ret, ir_stat;
- if (skip_ioapic_setup) {
+ if (ioapic_is_disabled) {
pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
return;
}
@@ -1994,19 +1956,19 @@ void __init enable_IR_x2apic(void)
* On AMD64 we trust the BIOS - if it says no APIC it is likely
* not correctly set up (usually the APIC timer won't work etc.)
*/
-static int __init detect_init_APIC(void)
+static bool __init detect_init_APIC(void)
{
if (!boot_cpu_has(X86_FEATURE_APIC)) {
pr_info("No local APIC present\n");
- return -1;
+ return false;
}
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- return 0;
+ register_lapic_address(APIC_DEFAULT_PHYS_BASE);
+ return true;
}
#else
-static int __init apic_verify(void)
+static bool __init apic_verify(unsigned long addr)
{
u32 features, h, l;
@@ -2017,28 +1979,28 @@ static int __init apic_verify(void)
features = cpuid_edx(1);
if (!(features & (1 << X86_FEATURE_APIC))) {
pr_warn("Could not enable APIC!\n");
- return -1;
+ return false;
}
set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
/* The BIOS may have set up the APIC at some other address */
if (boot_cpu_data.x86 >= 6) {
rdmsr(MSR_IA32_APICBASE, l, h);
if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+ addr = l & MSR_IA32_APICBASE_BASE;
}
+ register_lapic_address(addr);
pr_info("Found and enabled local APIC!\n");
- return 0;
+ return true;
}
-int __init apic_force_enable(unsigned long addr)
+bool __init apic_force_enable(unsigned long addr)
{
u32 h, l;
- if (disable_apic)
- return -1;
+ if (apic_is_disabled)
+ return false;
/*
* Some BIOSes disable the local APIC in the APIC_BASE
@@ -2055,17 +2017,17 @@ int __init apic_force_enable(unsigned long addr)
enabled_via_apicbase = 1;
}
}
- return apic_verify();
+ return apic_verify(addr);
}
/*
* Detect and initialize APIC
*/
-static int __init detect_init_APIC(void)
+static bool __init detect_init_APIC(void)
{
/* Disabled by kernel option? */
- if (disable_apic)
- return -1;
+ if (apic_is_disabled)
+ return false;
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
@@ -2092,22 +2054,22 @@ static int __init detect_init_APIC(void)
if (!force_enable_local_apic) {
pr_info("Local APIC disabled by BIOS -- "
"you can enable it with \"lapic\"\n");
- return -1;
+ return false;
}
- if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
- return -1;
+ if (!apic_force_enable(APIC_DEFAULT_PHYS_BASE))
+ return false;
} else {
- if (apic_verify())
- return -1;
+ if (!apic_verify(APIC_DEFAULT_PHYS_BASE))
+ return false;
}
apic_pm_activate();
- return 0;
+ return true;
no_apic:
pr_info("No local APIC present or hardware disabled\n");
- return -1;
+ return false;
}
#endif
@@ -2116,64 +2078,38 @@ no_apic:
*/
void __init init_apic_mappings(void)
{
- unsigned int new_apicid;
-
if (apic_validate_deadline_timer())
pr_info("TSC deadline timer available\n");
- if (x2apic_mode) {
- boot_cpu_physical_apicid = read_apic_id();
+ if (x2apic_mode)
return;
- }
- /* If no local APIC can be found return early */
- if (!smp_found_config && detect_init_APIC()) {
- /* lets NOP'ify apic operations */
- pr_info("APIC: disable apic facility\n");
- apic_disable();
- } else {
- apic_phys = mp_lapic_addr;
-
- /*
- * If the system has ACPI MADT tables or MP info, the LAPIC
- * address is already registered.
- */
- if (!acpi_lapic && !smp_found_config)
- register_lapic_address(apic_phys);
+ if (!smp_found_config) {
+ if (!detect_init_APIC()) {
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ }
+ num_processors = 1;
}
+}
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- new_apicid = read_apic_id();
- if (boot_cpu_physical_apicid != new_apicid) {
- boot_cpu_physical_apicid = new_apicid;
- /*
- * yeah -- we lie about apic_version
- * in case if apic was disabled via boot option
- * but it's not a problem for SMP compiled kernel
- * since apic_intr_mode_select is prepared for such
- * a case and disable smp mode
- */
- boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
- }
+static __init void apic_set_fixmap(void)
+{
+ set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
+ apic_mmio_base = APIC_BASE;
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ apic_mmio_base, mp_lapic_addr);
+ apic_read_boot_cpu_id(false);
}
void __init register_lapic_address(unsigned long address)
{
+ /* This should only happen once */
+ WARN_ON_ONCE(mp_lapic_addr);
mp_lapic_addr = address;
- if (!x2apic_mode) {
- set_fixmap_nocache(FIX_APIC_BASE, address);
- apic_mmio_base = APIC_BASE;
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, address);
- }
- if (boot_cpu_physical_apicid == -1U) {
- boot_cpu_physical_apicid = read_apic_id();
- boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
- }
+ if (!x2apic_mode)
+ apic_set_fixmap();
}
/*
@@ -2210,7 +2146,7 @@ static noinline void handle_spurious_interrupt(u8 vector)
if (v & (1 << (vector & 0x1f))) {
pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
vector, smp_processor_id());
- ack_APIC_irq();
+ apic_eoi();
} else {
pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
vector, smp_processor_id());
@@ -2261,7 +2197,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
v = apic_read(APIC_ESR);
- ack_APIC_irq();
+ apic_eoi();
atomic_inc(&irq_err_count);
apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
@@ -2446,54 +2382,43 @@ static int allocate_logical_cpuid(int apicid)
return nr_logical_cpuids++;
}
-int generic_processor_info(int apicid, int version)
+static void cpu_update_apic(int cpu, int apicid)
{
- int cpu, max = nr_cpu_ids;
- bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
- phys_cpu_present_map);
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+#endif
+ set_cpu_possible(cpu, true);
+ physid_set(apicid, phys_cpu_present_map);
+ set_cpu_present(cpu, true);
+ num_processors++;
- /*
- * boot_cpu_physical_apicid is designed to have the apicid
- * returned by read_apic_id(), i.e, the apicid of the
- * currently booting-up processor. However, on some platforms,
- * it is temporarily modified by the apicid reported as BSP
- * through MP table. Concretely:
- *
- * - arch/x86/kernel/mpparse.c: MP_processor_info()
- * - arch/x86/mm/amdtopology.c: amd_numa_init()
- *
- * This function is executed with the modified
- * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
- * parameter doesn't work to disable APs on kdump 2nd kernel.
- *
- * Since fixing handling of boot_cpu_physical_apicid requires
- * another discussion and tests on each platform, we leave it
- * for now and here we use read_apic_id() directly in this
- * function, generic_processor_info().
- */
- if (disabled_cpu_apicid != BAD_APICID &&
- disabled_cpu_apicid != read_apic_id() &&
- disabled_cpu_apicid == apicid) {
- int thiscpu = num_processors + disabled_cpus;
+ if (system_state != SYSTEM_BOOTING)
+ cpu_mark_primary_thread(cpu, apicid);
+}
- pr_warn("APIC: Disabling requested cpu."
- " Processor %d/0x%x ignored.\n", thiscpu, apicid);
+static __init void cpu_set_boot_apic(void)
+{
+ cpuid_to_apicid[0] = boot_cpu_physical_apicid;
+ cpu_update_apic(0, boot_cpu_physical_apicid);
+ x86_32_probe_bigsmp_early();
+}
- disabled_cpus++;
- return -ENODEV;
- }
+int generic_processor_info(int apicid)
+{
+ int cpu, max = nr_cpu_ids;
- /*
- * If boot cpu has not been detected yet, then only allow upto
- * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
- */
- if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
- apicid != boot_cpu_physical_apicid) {
- int thiscpu = max + disabled_cpus - 1;
+ /* The boot CPU must be set before MADT/MPTABLE parsing happens */
+ if (cpuid_to_apicid[0] == BAD_APICID)
+ panic("Boot CPU APIC not registered yet\n");
- pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost"
- " reached. Keeping one slot for boot cpu."
- " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+ if (apicid == boot_cpu_physical_apicid)
+ return 0;
+
+ if (disabled_cpu_apicid == apicid) {
+ int thiscpu = num_processors + disabled_cpus;
+
+ pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n",
+ thiscpu, apicid);
disabled_cpus++;
return -ENODEV;
@@ -2509,66 +2434,16 @@ int generic_processor_info(int apicid, int version)
return -EINVAL;
}
- if (apicid == boot_cpu_physical_apicid) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- * boot_cpu_init() already hold bit 0 in cpu_present_mask
- * for BSP.
- */
- cpu = 0;
-
- /* Logical cpuid 0 is reserved for BSP. */
- cpuid_to_apicid[0] = apicid;
- } else {
- cpu = allocate_logical_cpuid(apicid);
- if (cpu < 0) {
- disabled_cpus++;
- return -EINVAL;
- }
- }
-
- /*
- * Validate version
- */
- if (version == 0x0) {
- pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
- cpu, apicid);
- version = 0x10;
- }
-
- if (version != boot_cpu_apic_version) {
- pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
- boot_cpu_apic_version, cpu, version);
+ cpu = allocate_logical_cpuid(apicid);
+ if (cpu < 0) {
+ disabled_cpus++;
+ return -EINVAL;
}
- if (apicid > max_physical_apicid)
- max_physical_apicid = apicid;
-
-#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
- early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-#endif
-#ifdef CONFIG_X86_32
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
- apic->x86_32_early_logical_apicid(cpu);
-#endif
- set_cpu_possible(cpu, true);
- physid_set(apicid, phys_cpu_present_map);
- set_cpu_present(cpu, true);
- num_processors++;
-
- if (system_state != SYSTEM_BOOTING)
- cpu_mark_primary_thread(cpu, apicid);
-
+ cpu_update_apic(cpu, apicid);
return cpu;
}
-int hard_smp_processor_id(void)
-{
- return read_apic_id();
-}
void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
bool dmar)
@@ -2610,47 +2485,10 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
}
EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
-#ifdef CONFIG_X86_64
-void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler)
-{
- struct apic **drv;
-
- for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
- (*drv)->wakeup_secondary_cpu_64 = handler;
-}
-#endif
-
-/*
- * Override the generic EOI implementation with an optimized version.
- * Only called during early boot when only one CPU is active and with
- * interrupts disabled, so we know this does not race with actual APIC driver
- * use.
- */
-void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
-{
- struct apic **drv;
-
- for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
- /* Should happen once for each apic */
- WARN_ON((*drv)->eoi_write == eoi_write);
- (*drv)->native_eoi_write = (*drv)->eoi_write;
- (*drv)->eoi_write = eoi_write;
- }
-}
-
static void __init apic_bsp_up_setup(void)
{
#ifdef CONFIG_X86_64
apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid));
-#else
- /*
- * Hack: In case of kdump, after a crash, kernel might be booting
- * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
- * might be zero if read from MP tables. Get it from LAPIC.
- */
-# ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = read_apic_id();
-# endif
#endif
physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
}
@@ -2919,7 +2757,7 @@ int apic_is_clustered_box(void)
*/
static int __init setup_disableapic(char *arg)
{
- disable_apic = 1;
+ apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
@@ -2956,11 +2794,11 @@ early_param("nolapic_timer", parse_nolapic_timer);
static int __init apic_set_verbosity(char *arg)
{
if (!arg) {
-#ifdef CONFIG_X86_64
- skip_ioapic_setup = 0;
+ if (IS_ENABLED(CONFIG_X86_32))
+ return -EINVAL;
+
+ ioapic_is_disabled = false;
return 0;
-#endif
- return -EINVAL;
}
if (strcmp("debug", arg) == 0)
@@ -2981,11 +2819,11 @@ early_param("apic", apic_set_verbosity);
static int __init lapic_insert_resource(void)
{
- if (!apic_phys)
+ if (!apic_mmio_base)
return -1;
/* Put local APIC into the resource map. */
- lapic_resource.start = apic_phys;
+ lapic_resource.start = apic_mmio_base;
lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
insert_resource(&iomem_resource, &lapic_resource);
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 02b4839478b1..7bc5d9bf59cd 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -6,6 +6,8 @@
#include <linux/irq.h>
#include <asm/apic.h>
+#include "local.h"
+
u32 apic_default_calc_apicid(unsigned int cpu)
{
return per_cpu(x86_cpu_to_apicid, cpu);
@@ -29,18 +31,27 @@ void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
int default_cpu_present_to_apicid(int mps_cpu)
{
if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ return (int)per_cpu(x86_cpu_to_apicid, mps_cpu);
else
return BAD_APICID;
}
EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
-int default_check_phys_apicid_present(int phys_apicid)
+bool default_apic_id_registered(void)
{
- return physid_isset(phys_apicid, phys_cpu_present_map);
+ return physid_isset(read_apic_id(), phys_cpu_present_map);
}
-int default_apic_id_valid(u32 apicid)
+/*
+ * Set up the logical destination ID when the APIC operates in logical
+ * destination mode.
+ */
+void default_init_apic_ldr(void)
{
- return (apicid < 255);
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 8f72b4351c9f..032a84e2c3cc 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -28,26 +28,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 1;
}
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-void flat_init_apic_ldr(void)
-{
- unsigned long val;
- unsigned long num, id;
-
- num = smp_processor_id();
- id = 1UL << num;
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
static void _flat_send_IPI_mask(unsigned long mask, int vector)
{
unsigned long flags;
@@ -86,16 +66,6 @@ static u32 set_apic_id(unsigned int id)
return (id & 0xFF) << 24;
}
-static unsigned int read_xapic_id(void)
-{
- return flat_get_apic_id(apic_read(APIC_ID));
-}
-
-static int flat_apic_id_registered(void)
-{
- return physid_isset(read_xapic_id(), phys_cpu_present_map);
-}
-
static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
@@ -110,23 +80,18 @@ static struct apic apic_flat __ro_after_init = {
.name = "flat",
.probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = flat_apic_id_registered,
+ .apic_id_registered = default_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
- .check_apicid_used = NULL,
- .init_apic_ldr = flat_init_apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
+ .init_apic_ldr = default_init_apic_ldr,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = flat_phys_pkg_id,
+ .max_apic_id = 0xFE,
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
@@ -139,15 +104,13 @@ static struct apic apic_flat __ro_after_init = {
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .inquire_remote_apic = default_inquire_remote_apic,
-
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
/*
@@ -178,22 +141,9 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static void physflat_init_apic_ldr(void)
-{
- /*
- * LDR and DFR are not involved in physflat mode, rather:
- * "In physical destination mode, the destination processor is
- * specified by its local APIC ID [...]." (Intel SDM, 10.6.2.1)
- */
-}
-
static int physflat_probe(void)
{
- if (apic == &apic_physflat || num_possible_cpus() > 8 ||
- jailhouse_paravirt())
- return 1;
-
- return 0;
+ return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt();
}
static struct apic apic_physflat __ro_after_init = {
@@ -201,8 +151,7 @@ static struct apic apic_physflat __ro_after_init = {
.name = "physical flat",
.probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = flat_apic_id_registered,
+ .apic_id_registered = default_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
@@ -210,14 +159,11 @@ static struct apic apic_physflat __ro_after_init = {
.disable_esr = 0,
.check_apicid_used = NULL,
- .init_apic_ldr = physflat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = flat_phys_pkg_id,
+ .max_apic_id = 0xFE,
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
@@ -230,15 +176,13 @@ static struct apic apic_physflat __ro_after_init = {
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .inquire_remote_apic = default_inquire_remote_apic,
-
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
/*
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index fe78319e0f7a..966d7cf10b95 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -8,92 +8,42 @@
* Though in case if apic is disabled (for some reason) we try
* to not uglify the caller's code and allow to call (some) apic routines
* like self-ipi, etc...
+ *
+ * FIXME: Remove this gunk. The above argument which was intentionally left
+ * in place is silly to begin with because none of the callbacks except for
+ * APIC::read/write() have a WARN_ON_ONCE() in them. Sigh...
*/
#include <linux/cpumask.h>
#include <linux/thread_info.h>
#include <asm/apic.h>
-static void noop_init_apic_ldr(void) { }
static void noop_send_IPI(int cpu, int vector) { }
static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
static void noop_send_IPI_allbutself(int vector) { }
static void noop_send_IPI_all(int vector) { }
static void noop_send_IPI_self(int vector) { }
-static void noop_apic_wait_icr_idle(void) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
-
-static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
-{
- return -1;
-}
-
-static u32 noop_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
-static u64 noop_apic_icr_read(void)
-{
- return 0;
-}
-
-static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return 0;
-}
-
-static unsigned int noop_get_apic_id(unsigned long x)
-{
- return 0;
-}
-
-static int noop_probe(void)
-{
- /*
- * NOOP apic should not ever be
- * enabled via probe routine
- */
- return 0;
-}
-
-static int noop_apic_id_registered(void)
-{
- /*
- * if we would be really "pedantic"
- * we should pass read_apic_id() here
- * but since NOOP suppose APIC ID = 0
- * lets save a few cycles
- */
- return physid_isset(0, phys_cpu_present_map);
-}
+static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { return -1; }
+static u64 noop_apic_icr_read(void) { return 0; }
+static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; }
+static unsigned int noop_get_apic_id(unsigned long x) { return 0; }
+static void noop_apic_eoi(void) { }
static u32 noop_apic_read(u32 reg)
{
- WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled);
return 0;
}
-static void noop_apic_write(u32 reg, u32 v)
-{
- WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
-}
-
-#ifdef CONFIG_X86_32
-static int noop_x86_32_early_logical_apicid(int cpu)
+static void noop_apic_write(u32 reg, u32 val)
{
- return BAD_APICID;
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !apic_is_disabled);
}
-#endif
struct apic apic_noop __ro_after_init = {
.name = "noop",
- .probe = noop_probe,
- .acpi_madt_oem_check = NULL,
-
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = noop_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
@@ -101,18 +51,13 @@ struct apic apic_noop __ro_after_init = {
.disable_esr = 0,
.check_apicid_used = default_check_apicid_used,
- .init_apic_ldr = noop_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
-
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = noop_phys_pkg_id,
+ .max_apic_id = 0xFE,
.get_apic_id = noop_get_apic_id,
- .set_apic_id = NULL,
.calc_dest_apicid = apic_flat_calc_apicid,
@@ -125,17 +70,9 @@ struct apic apic_noop __ro_after_init = {
.wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
- .inquire_remote_apic = NULL,
-
.read = noop_apic_read,
.write = noop_apic_write,
- .eoi_write = noop_apic_write,
+ .eoi = noop_apic_eoi,
.icr_read = noop_apic_icr_read,
.icr_write = noop_apic_icr_write,
- .wait_icr_idle = noop_apic_wait_icr_idle,
- .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
-
-#ifdef CONFIG_X86_32
- .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
-#endif
};
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index a54d817eb4b6..63f3d7be9dc7 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,17 +56,6 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
}
-static int numachip_apic_id_valid(u32 apicid)
-{
- /* Trust what bootloader passes in MADT */
- return 1;
-}
-
-static int numachip_apic_id_registered(void)
-{
- return 1;
-}
-
static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
@@ -228,38 +217,20 @@ static int numachip2_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 1;
}
-/* APIC IPIs are queued */
-static void numachip_apic_wait_icr_idle(void)
-{
-}
-
-/* APIC NMI IPIs are queued */
-static u32 numachip_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
static const struct apic apic_numachip1 __refconst = {
.name = "NumaConnect system",
.probe = numachip1_probe,
.acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
- .apic_id_valid = numachip_apic_id_valid,
- .apic_id_registered = numachip_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
- .check_apicid_used = NULL,
- .init_apic_ldr = flat_init_apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = numachip_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
.get_apic_id = numachip1_get_apic_id,
.set_apic_id = numachip1_set_apic_id,
@@ -273,15 +244,12 @@ static const struct apic apic_numachip1 __refconst = {
.send_IPI_self = numachip_send_IPI_self,
.wakeup_secondary_cpu = numachip_wakeup_secondary,
- .inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = numachip_apic_wait_icr_idle,
- .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
};
apic_driver(apic_numachip1);
@@ -290,23 +258,16 @@ static const struct apic apic_numachip2 __refconst = {
.name = "NumaConnect2 system",
.probe = numachip2_probe,
.acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
- .apic_id_valid = numachip_apic_id_valid,
- .apic_id_registered = numachip_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
- .check_apicid_used = NULL,
- .init_apic_ldr = flat_init_apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = numachip_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
.get_apic_id = numachip2_get_apic_id,
.set_apic_id = numachip2_set_apic_id,
@@ -320,15 +281,12 @@ static const struct apic apic_numachip2 __refconst = {
.send_IPI_self = numachip_send_IPI_self,
.wakeup_secondary_cpu = numachip_wakeup_secondary,
- .inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = numachip_apic_wait_icr_idle,
- .safe_wait_icr_idle = numachip_safe_apic_wait_icr_idle,
};
apic_driver(apic_numachip2);
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77555f66c14d..0e5535add4b5 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -18,56 +18,17 @@ static unsigned bigsmp_get_apic_id(unsigned long x)
return (x >> 24) & 0xFF;
}
-static int bigsmp_apic_id_registered(void)
-{
- return 1;
-}
-
static bool bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
{
return false;
}
-static int bigsmp_early_logical_apicid(int cpu)
-{
- /* on bigsmp, logical apicid is the same as physical */
- return early_per_cpu(x86_cpu_to_apicid, cpu);
-}
-
-/*
- * bigsmp enables physical destination mode
- * and doesn't use LDR and DFR
- */
-static void bigsmp_init_apic_ldr(void)
-{
-}
-
-static void bigsmp_setup_apic_routing(void)
-{
- printk(KERN_INFO
- "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static int bigsmp_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < nr_cpu_ids)
- return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
-
- return BAD_APICID;
-}
-
static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
/* For clustered we don't have a good way to do this yet - hack */
physids_promote(0xFFL, retmap);
}
-static int bigsmp_check_phys_apicid_present(int phys_apicid)
-{
- return 1;
-}
-
static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
{
return cpuid_apic >> index_msb;
@@ -111,21 +72,13 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
static int probe_bigsmp(void)
{
- if (def_to_bigsmp)
- dmi_bigsmp = 1;
- else
- dmi_check_system(bigsmp_dmi_table);
-
- return dmi_bigsmp;
+ return dmi_check_system(bigsmp_dmi_table);
}
static struct apic apic_bigsmp __ro_after_init = {
.name = "bigsmp",
.probe = probe_bigsmp,
- .acpi_madt_oem_check = NULL,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = bigsmp_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
@@ -133,14 +86,11 @@ static struct apic apic_bigsmp __ro_after_init = {
.disable_esr = 1,
.check_apicid_used = bigsmp_check_apicid_used,
- .init_apic_ldr = bigsmp_init_apic_ldr,
.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
- .setup_apic_routing = bigsmp_setup_apic_routing,
- .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
- .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
.phys_pkg_id = bigsmp_phys_pkg_id,
+ .max_apic_id = 0xFE,
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
@@ -153,37 +103,24 @@ static struct apic apic_bigsmp __ro_after_init = {
.send_IPI_all = bigsmp_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .inquire_remote_apic = default_inquire_remote_apic,
-
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
-void __init generic_bigsmp_probe(void)
+bool __init apic_bigsmp_possible(bool cmdline_override)
{
- unsigned int cpu;
-
- if (!probe_bigsmp())
- return;
-
- apic = &apic_bigsmp;
-
- for_each_possible_cpu(cpu) {
- if (early_per_cpu(x86_cpu_to_logical_apicid,
- cpu) == BAD_APICID)
- continue;
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
- bigsmp_early_logical_apicid(cpu);
- }
+ return apic == &apic_bigsmp || !cmdline_override;
+}
- pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name);
+void __init apic_bigsmp_force(void)
+{
+ if (apic != &apic_bigsmp)
+ apic_install_driver(&apic_bigsmp);
}
apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 34a992e275ef..45af535c44a0 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -21,6 +21,8 @@
#include <linux/init.h>
#include <linux/delay.h>
+#include "local.h"
+
#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
@@ -31,12 +33,12 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#ifdef arch_trigger_cpumask_backtrace
static void nmi_raise_cpu_backtrace(cpumask_t *mask)
{
- apic->send_IPI_mask(mask, NMI_VECTOR);
+ __apic_send_IPI_mask(mask, NMI_VECTOR);
}
-void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
{
- nmi_trigger_cpumask_backtrace(mask, exclude_self,
+ nmi_trigger_cpumask_backtrace(mask, exclude_cpu,
nmi_raise_cpu_backtrace);
}
diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c
new file mode 100644
index 000000000000..821e2e536f19
--- /dev/null
+++ b/arch/x86/kernel/apic/init.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) "APIC: " fmt
+
+#include <asm/apic.h>
+
+#include "local.h"
+
+/*
+ * Use DEFINE_STATIC_CALL_NULL() to avoid having to provide stub functions
+ * for each callback. The callbacks are setup during boot and all except
+ * wait_icr_idle() must be initialized before usage. The IPI wrappers
+ * use static_call() and not static_call_cond() to catch any fails.
+ */
+#define DEFINE_APIC_CALL(__cb) \
+ DEFINE_STATIC_CALL_NULL(apic_call_##__cb, *apic->__cb)
+
+DEFINE_APIC_CALL(eoi);
+DEFINE_APIC_CALL(native_eoi);
+DEFINE_APIC_CALL(icr_read);
+DEFINE_APIC_CALL(icr_write);
+DEFINE_APIC_CALL(read);
+DEFINE_APIC_CALL(send_IPI);
+DEFINE_APIC_CALL(send_IPI_mask);
+DEFINE_APIC_CALL(send_IPI_mask_allbutself);
+DEFINE_APIC_CALL(send_IPI_allbutself);
+DEFINE_APIC_CALL(send_IPI_all);
+DEFINE_APIC_CALL(send_IPI_self);
+DEFINE_APIC_CALL(wait_icr_idle);
+DEFINE_APIC_CALL(wakeup_secondary_cpu);
+DEFINE_APIC_CALL(wakeup_secondary_cpu_64);
+DEFINE_APIC_CALL(write);
+
+EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask);
+EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self);
+
+/* The container for function call overrides */
+struct apic_override __x86_apic_override __initdata;
+
+#define apply_override(__cb) \
+ if (__x86_apic_override.__cb) \
+ apic->__cb = __x86_apic_override.__cb
+
+static __init void restore_override_callbacks(void)
+{
+ apply_override(eoi);
+ apply_override(native_eoi);
+ apply_override(write);
+ apply_override(read);
+ apply_override(send_IPI);
+ apply_override(send_IPI_mask);
+ apply_override(send_IPI_mask_allbutself);
+ apply_override(send_IPI_allbutself);
+ apply_override(send_IPI_all);
+ apply_override(send_IPI_self);
+ apply_override(icr_read);
+ apply_override(icr_write);
+ apply_override(wakeup_secondary_cpu);
+ apply_override(wakeup_secondary_cpu_64);
+}
+
+#define update_call(__cb) \
+ static_call_update(apic_call_##__cb, *apic->__cb)
+
+static __init void update_static_calls(void)
+{
+ update_call(eoi);
+ update_call(native_eoi);
+ update_call(write);
+ update_call(read);
+ update_call(send_IPI);
+ update_call(send_IPI_mask);
+ update_call(send_IPI_mask_allbutself);
+ update_call(send_IPI_allbutself);
+ update_call(send_IPI_all);
+ update_call(send_IPI_self);
+ update_call(icr_read);
+ update_call(icr_write);
+ update_call(wait_icr_idle);
+ update_call(wakeup_secondary_cpu);
+ update_call(wakeup_secondary_cpu_64);
+}
+
+void __init apic_setup_apic_calls(void)
+{
+ /* Ensure that the default APIC has native_eoi populated */
+ apic->native_eoi = apic->eoi;
+ update_static_calls();
+ pr_info("Static calls initialized\n");
+}
+
+void __init apic_install_driver(struct apic *driver)
+{
+ if (apic == driver)
+ return;
+
+ apic = driver;
+
+ if (IS_ENABLED(CONFIG_X86_X2APIC) && apic->x2apic_set_max_apicid)
+ apic->max_apic_id = x2apic_max_apicid;
+
+ /* Copy the original eoi() callback as KVM/HyperV might overwrite it */
+ if (!apic->native_eoi)
+ apic->native_eoi = apic->eoi;
+
+ /* Apply any already installed callback overrides */
+ restore_override_callbacks();
+ update_static_calls();
+
+ pr_info("Switched APIC routing to: %s\n", driver->name);
+}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4241dc243aa8..00da6cf6b07d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -178,7 +178,7 @@ int mp_bus_id_to_type[MAX_MP_BUSSES];
DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int skip_ioapic_setup;
+bool ioapic_is_disabled __ro_after_init;
/**
* disable_ioapic_support() - disables ioapic support at runtime
@@ -189,7 +189,7 @@ void disable_ioapic_support(void)
noioapicquirk = 1;
noioapicreroute = -1;
#endif
- skip_ioapic_setup = 1;
+ ioapic_is_disabled = true;
}
static int __init parse_noapic(char *str)
@@ -831,7 +831,7 @@ static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity)
{
int ioapic, pin, idx;
- if (skip_ioapic_setup)
+ if (ioapic_is_disabled)
return -1;
ioapic = mp_find_ioapic(gsi);
@@ -1366,7 +1366,7 @@ void __init enable_IO_APIC(void)
int i8259_apic, i8259_pin;
int apic, pin;
- if (skip_ioapic_setup)
+ if (ioapic_is_disabled)
nr_ioapics = 0;
if (!nr_legacy_irqs() || !nr_ioapics)
@@ -1511,13 +1511,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
physid_set(i, phys_id_present_map);
ioapics[ioapic_idx].mp_config.apicid = i;
} else {
- physid_mask_t tmp;
- apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx),
- &tmp);
- apic_printk(APIC_VERBOSE, "Setting %d in the "
- "phys_id_present_map\n",
- mpc_ioapic_id(ioapic_idx));
- physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
+ physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map);
}
/*
@@ -1827,7 +1823,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
* We must acknowledge the irq before we move it or the acknowledge will
* not propagate properly.
*/
- ack_APIC_irq();
+ apic_eoi();
/*
* Tail end of clearing remote IRR bit (either by delivering the EOI
@@ -2050,7 +2046,7 @@ static void unmask_lapic_irq(struct irq_data *data)
static void ack_lapic_irq(struct irq_data *data)
{
- ack_APIC_irq();
+ apic_eoi();
}
static struct irq_chip lapic_chip __read_mostly = {
@@ -2095,7 +2091,7 @@ static inline void __init unlock_ExtINT_logic(void)
entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
- apic_id = hard_smp_processor_id();
+ apic_id = read_apic_id();
memset(&entry1, 0, sizeof(entry1));
entry1.dest_mode_logical = true;
@@ -2399,7 +2395,7 @@ void __init setup_IO_APIC(void)
{
int ioapic;
- if (skip_ioapic_setup || !nr_ioapics)
+ if (ioapic_is_disabled || !nr_ioapics)
return;
io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
@@ -2546,7 +2542,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id)
apic_id = i;
}
- apic->apicid_to_cpu_present(apic_id, &tmp);
+ physid_set_mask_of_physid(apic_id, &tmp);
physids_or(apic_id_map, apic_id_map, tmp);
if (reg_00.bits.ID != apic_id) {
@@ -2715,7 +2711,7 @@ void __init io_apic_init_mappings(void)
"address found in MPTABLE, "
"disabling IO/APIC support!\n");
smp_found_config = 0;
- skip_ioapic_setup = 1;
+ ioapic_is_disabled = true;
goto fake_ioapic_page;
}
#endif
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 9bfd6e397384..a44ba7209ef3 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/cpumask.h>
+#include <linux/delay.h>
#include <linux/smp.h>
+
#include <asm/io_apic.h>
#include "local.h"
@@ -52,9 +54,9 @@ void apic_send_IPI_allbutself(unsigned int vector)
return;
if (static_branch_likely(&apic_use_ipi_shorthand))
- apic->send_IPI_allbutself(vector);
+ __apic_send_IPI_allbutself(vector);
else
- apic->send_IPI_mask_allbutself(cpu_online_mask, vector);
+ __apic_send_IPI_mask_allbutself(cpu_online_mask, vector);
}
/*
@@ -68,12 +70,12 @@ void native_smp_send_reschedule(int cpu)
WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu);
return;
}
- apic->send_IPI(cpu, RESCHEDULE_VECTOR);
+ __apic_send_IPI(cpu, RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
- apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
+ __apic_send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR);
}
void native_send_call_func_ipi(const struct cpumask *mask)
@@ -85,14 +87,14 @@ void native_send_call_func_ipi(const struct cpumask *mask)
goto sendmask;
if (cpumask_test_cpu(cpu, mask))
- apic->send_IPI_all(CALL_FUNCTION_VECTOR);
+ __apic_send_IPI_all(CALL_FUNCTION_VECTOR);
else if (num_online_cpus() > 1)
- apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ __apic_send_IPI_allbutself(CALL_FUNCTION_VECTOR);
return;
}
sendmask:
- apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+ __apic_send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
}
#endif /* CONFIG_SMP */
@@ -102,74 +104,77 @@ static inline int __prepare_ICR2(unsigned int mask)
return SET_XAPIC_DEST_FIELD(mask);
}
-static inline void __xapic_wait_icr_idle(void)
+u32 apic_mem_wait_icr_idle_timeout(void)
+{
+ int cnt;
+
+ for (cnt = 0; cnt < 1000; cnt++) {
+ if (!(apic_read(APIC_ICR) & APIC_ICR_BUSY))
+ return 0;
+ inc_irq_stat(icr_read_retry_count);
+ udelay(100);
+ }
+ return APIC_ICR_BUSY;
+}
+
+void apic_mem_wait_icr_idle(void)
{
while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
cpu_relax();
}
-void __default_send_IPI_shortcut(unsigned int shortcut, int vector)
+/*
+ * This is safe against interruption because it only writes the lower 32
+ * bits of the APIC_ICR register. The destination field is ignored for
+ * short hand IPIs.
+ *
+ * wait_icr_idle()
+ * write(ICR2, dest)
+ * NMI
+ * wait_icr_idle()
+ * write(ICR)
+ * wait_icr_idle()
+ * write(ICR)
+ *
+ * This function does not need to disable interrupts as there is no ICR2
+ * interaction. The memory write is direct except when the machine is
+ * affected by the 11AP Pentium erratum, which turns the plain write into
+ * an XCHG operation.
+ */
+static void __default_send_IPI_shortcut(unsigned int shortcut, int vector)
{
/*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
-
- /*
- * Wait for idle.
+ * Wait for the previous ICR command to complete. Use
+ * safe_apic_wait_icr_idle() for the NMI vector as there have been
+ * issues where otherwise the system hangs when the panic CPU tries
+ * to stop the others before launching the kdump kernel.
*/
if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
+ apic_mem_wait_icr_idle_timeout();
else
- __xapic_wait_icr_idle();
+ apic_mem_wait_icr_idle();
- /*
- * No need to touch the target chip field. Also the destination
- * mode is ignored when a shorthand is used.
- */
- cfg = __prepare_ICR(shortcut, vector, 0);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- native_apic_mem_write(APIC_ICR, cfg);
+ /* Destination field (ICR2) and the destination mode are ignored */
+ native_apic_mem_write(APIC_ICR, __prepare_ICR(shortcut, vector, 0));
}
/*
* This is used to send an IPI with no shorthand notation (the destination is
* specified in bits 56 to 63 of the ICR).
*/
-void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest)
+void __default_send_IPI_dest_field(unsigned int dest_mask, int vector,
+ unsigned int dest_mode)
{
- unsigned long cfg;
-
- /*
- * Wait for idle.
- */
+ /* See comment in __default_send_IPI_shortcut() */
if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
+ apic_mem_wait_icr_idle_timeout();
else
- __xapic_wait_icr_idle();
+ apic_mem_wait_icr_idle();
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- native_apic_mem_write(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector, dest);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- native_apic_mem_write(APIC_ICR, cfg);
+ /* Set the IPI destination field in the ICR */
+ native_apic_mem_write(APIC_ICR2, __prepare_ICR2(dest_mask));
+ /* Send it with the proper destination mode */
+ native_apic_mem_write(APIC_ICR, __prepare_ICR(0, vector, dest_mode));
}
void default_send_IPI_single_phys(int cpu, int vector)
@@ -184,18 +189,13 @@ void default_send_IPI_single_phys(int cpu, int vector)
void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
{
- unsigned long query_cpu;
unsigned long flags;
+ unsigned long cpu;
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicast to each CPU instead.
- * - mbligh
- */
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
+ for_each_cpu(cpu, mask) {
__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
- query_cpu), vector, APIC_DEST_PHYSICAL);
+ cpu), vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
@@ -203,18 +203,15 @@ void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
int vector)
{
- unsigned int this_cpu = smp_processor_id();
- unsigned int query_cpu;
+ unsigned int cpu, this_cpu = smp_processor_id();
unsigned long flags;
- /* See Hack comment above */
-
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
+ for_each_cpu(cpu, mask) {
+ if (cpu == this_cpu)
continue;
__default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
- query_cpu), vector, APIC_DEST_PHYSICAL);
+ cpu), vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
@@ -224,7 +221,7 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
*/
void default_send_IPI_single(int cpu, int vector)
{
- apic->send_IPI_mask(cpumask_of(cpu), vector);
+ __apic_send_IPI_mask(cpumask_of(cpu), vector);
}
void default_send_IPI_allbutself(int vector)
@@ -243,50 +240,32 @@ void default_send_IPI_self(int vector)
}
#ifdef CONFIG_X86_32
-
-void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
- int vector)
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector)
{
unsigned long flags;
- unsigned int query_cpu;
-
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
- */
+ unsigned int cpu;
local_irq_save(flags);
- for_each_cpu(query_cpu, mask)
- __default_send_IPI_dest_field(
- early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, APIC_DEST_LOGICAL);
+ for_each_cpu(cpu, mask)
+ __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
int vector)
{
+ unsigned int cpu, this_cpu = smp_processor_id();
unsigned long flags;
- unsigned int query_cpu;
- unsigned int this_cpu = smp_processor_id();
-
- /* See Hack comment above */
local_irq_save(flags);
- for_each_cpu(query_cpu, mask) {
- if (query_cpu == this_cpu)
+ for_each_cpu(cpu, mask) {
+ if (cpu == this_cpu)
continue;
- __default_send_IPI_dest_field(
- early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, APIC_DEST_LOGICAL);
- }
+ __default_send_IPI_dest_field(1U << cpu, vector, APIC_DEST_LOGICAL);
+ }
local_irq_restore(flags);
}
-/*
- * This is only used on smaller machines.
- */
void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
{
unsigned long mask = cpumask_bits(cpumask)[0];
@@ -302,7 +281,6 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
}
#ifdef CONFIG_SMP
-/* must come after the send_IPI functions above for inlining */
static int convert_apicid_to_cpu(int apic_id)
{
int i;
@@ -321,7 +299,7 @@ int safe_smp_processor_id(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return 0;
- apicid = hard_smp_processor_id();
+ apicid = read_apic_id();
if (apicid == BAD_APICID)
return 0;
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index a997d849509a..ec219c659c7d 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -13,18 +13,16 @@
#include <asm/irq_vectors.h>
#include <asm/apic.h>
-/* APIC flat 64 */
-void flat_init_apic_ldr(void);
-
/* X2APIC */
-int x2apic_apic_id_valid(u32 apicid);
-int x2apic_apic_id_registered(void);
void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
unsigned int x2apic_get_apic_id(unsigned long id);
u32 x2apic_set_apic_id(unsigned int id);
int x2apic_phys_pkg_id(int initial_apicid, int index_msb);
+
+void x2apic_send_IPI_all(int vector);
+void x2apic_send_IPI_allbutself(int vector);
void x2apic_send_IPI_self(int vector);
-void __x2apic_send_IPI_shorthand(int vector, u32 which);
+extern u32 x2apic_max_apicid;
/* IPI */
@@ -46,7 +44,10 @@ static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
return icr;
}
-void __default_send_IPI_shortcut(unsigned int shortcut, int vector);
+void default_init_apic_ldr(void);
+
+void apic_mem_wait_icr_idle(void);
+u32 apic_mem_wait_icr_idle_timeout(void);
/*
* This is used to send an IPI with no shorthand notation (the destination is
@@ -62,8 +63,23 @@ void default_send_IPI_allbutself(int vector);
void default_send_IPI_all(int vector);
void default_send_IPI_self(int vector);
+bool default_apic_id_registered(void);
+
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector);
void default_send_IPI_mask_logical(const struct cpumask *mask, int vector);
+void x86_32_probe_bigsmp_early(void);
+void x86_32_install_bigsmp(void);
+#else
+static inline void x86_32_probe_bigsmp_early(void) { }
+static inline void x86_32_install_bigsmp(void) { }
+#endif
+
+#ifdef CONFIG_X86_BIGSMP
+bool apic_bigsmp_possible(bool cmdline_selected);
+void apic_bigsmp_force(void);
+#else
+static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; };
+static inline void apic_bigsmp_force(void) { }
#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 35d5b8fb18ef..6b6b711678fe 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -269,7 +269,7 @@ static const struct msi_parent_ops x86_vector_msi_parent_ops = {
struct irq_domain * __init native_create_pci_msi_domain(void)
{
- if (disable_apic)
+ if (apic_is_disabled)
return NULL;
x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index a61f642b1b90..9a06df6cdd68 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -10,46 +10,14 @@
#include <linux/errno.h>
#include <linux/smp.h>
+#include <xen/xen.h>
+
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/acpi.h>
#include "local.h"
-static int default_x86_32_early_logical_apicid(int cpu)
-{
- return 1 << cpu;
-}
-
-static void setup_apic_flat_routing(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- printk(KERN_INFO
- "Enabling APIC mode: Flat. Using %d I/O APICs\n",
- nr_ioapics);
-#endif
-}
-
-static int default_apic_id_registered(void)
-{
- return physid_isset(read_apic_id(), phys_cpu_present_map);
-}
-
-/*
- * Set up the logical destination ID. Intel recommends to set DFR, LDR and
- * TPR before enabling an APIC. See e.g. "AP-388 82489DX User's Manual"
- * (Intel document number 292116).
- */
-static void default_init_apic_ldr(void)
-{
- unsigned long val;
-
- apic_write(APIC_DFR, APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
- apic_write(APIC_LDR, val);
-}
-
static int default_phys_pkg_id(int cpuid_apic, int index_msb)
{
return cpuid_apic >> index_msb;
@@ -65,8 +33,6 @@ static struct apic apic_default __ro_after_init = {
.name = "default",
.probe = probe_default,
- .acpi_madt_oem_check = NULL,
- .apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
@@ -77,14 +43,11 @@ static struct apic apic_default __ro_after_init = {
.check_apicid_used = default_check_apicid_used,
.init_apic_ldr = default_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
- .setup_apic_routing = setup_apic_flat_routing,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid,
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = default_phys_pkg_id,
+ .max_apic_id = 0xFE,
.get_apic_id = default_get_apic_id,
- .set_apic_id = NULL,
.calc_dest_apicid = apic_flat_calc_apicid,
@@ -95,17 +58,13 @@ static struct apic apic_default __ro_after_init = {
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .inquire_remote_apic = default_inquire_remote_apic,
-
.read = native_apic_mem_read,
.write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
+ .eoi = native_apic_mem_eoi,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
+ .wait_icr_idle = apic_mem_wait_icr_idle,
+ .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
apic_driver(apic_default);
@@ -123,7 +82,7 @@ static int __init parse_apic(char *arg)
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if (!strcmp((*drv)->name, arg)) {
- apic = *drv;
+ apic_install_driver(*drv);
cmdline_apic = 1;
return 0;
}
@@ -134,49 +93,43 @@ static int __init parse_apic(char *arg)
}
early_param("apic", parse_apic);
-void __init default_setup_apic_routing(void)
+void __init x86_32_probe_bigsmp_early(void)
{
- int version = boot_cpu_apic_version;
+ if (nr_cpu_ids <= 8 || xen_pv_domain())
+ return;
- if (num_possible_cpus() > 8) {
+ if (IS_ENABLED(CONFIG_X86_BIGSMP)) {
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(version)) {
- def_to_bigsmp = 0;
+ if (!APIC_XAPIC(boot_cpu_apic_version))
break;
- }
/* P4 and above */
fallthrough;
case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
+ if (apic_bigsmp_possible(cmdline_apic))
+ return;
+ break;
}
}
+ pr_info("Limiting to 8 possible CPUs\n");
+ set_nr_cpu_ids(8);
+}
-#ifdef CONFIG_X86_BIGSMP
- /*
- * This is used to switch to bigsmp mode when
- * - There is no apic= option specified by the user
- * - generic_apic_probe() has chosen apic_default as the sub_arch
- * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
- */
-
- if (!cmdline_apic && apic == &apic_default)
- generic_bigsmp_probe();
-#endif
-
- if (apic->setup_apic_routing)
- apic->setup_apic_routing();
+void __init x86_32_install_bigsmp(void)
+{
+ if (nr_cpu_ids > 8 && !xen_pv_domain())
+ apic_bigsmp_force();
}
-void __init generic_apic_probe(void)
+void __init x86_32_probe_apic(void)
{
if (!cmdline_apic) {
struct apic **drv;
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->probe()) {
- apic = *drv;
+ apic_install_driver(*drv);
break;
}
}
@@ -184,26 +137,4 @@ void __init generic_apic_probe(void)
if (drv == __apicdrivers_end)
panic("Didn't find an APIC driver");
}
- printk(KERN_INFO "Using APIC driver %s\n", apic->name);
-}
-
-/* This function can switch the APIC even after the initial ->probe() */
-int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- struct apic **drv;
-
- for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
- if (!(*drv)->acpi_madt_oem_check)
- continue;
- if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
- continue;
-
- if (!cmdline_apic) {
- apic = *drv;
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- apic->name);
- }
- return 1;
- }
- return 0;
}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c46720f185c0..ecdf0c4121e1 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -13,10 +13,8 @@
#include "local.h"
-/*
- * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
- */
-void __init default_setup_apic_routing(void)
+/* Select the appropriate APIC driver */
+void __init x86_64_probe_apic(void)
{
struct apic **drv;
@@ -24,11 +22,7 @@ void __init default_setup_apic_routing(void)
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->probe && (*drv)->probe()) {
- if (apic != *drv) {
- apic = *drv;
- pr_info("Switched APIC routing to %s.\n",
- apic->name);
- }
+ apic_install_driver(*drv);
break;
}
}
@@ -40,11 +34,7 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
- if (apic != *drv) {
- apic = *drv;
- pr_info("Setting APIC routing to %s.\n",
- apic->name);
- }
+ apic_install_driver(*drv);
return 1;
}
}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index c1efebd27e6c..319448d87b99 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -44,7 +44,18 @@ static cpumask_var_t vector_searchmask;
static struct irq_chip lapic_controller;
static struct irq_matrix *vector_matrix;
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct hlist_head, cleanup_list);
+
+static void vector_cleanup_callback(struct timer_list *tmr);
+
+struct vector_cleanup {
+ struct hlist_head head;
+ struct timer_list timer;
+};
+
+static DEFINE_PER_CPU(struct vector_cleanup, vector_cleanup) = {
+ .head = HLIST_HEAD_INIT,
+ .timer = __TIMER_INITIALIZER(vector_cleanup_callback, TIMER_PINNED),
+};
#endif
void lock_vector_lock(void)
@@ -536,7 +547,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
struct irq_data *irqd;
int i, err, node;
- if (disable_apic)
+ if (apic_is_disabled)
return -ENXIO;
/*
@@ -680,7 +691,7 @@ static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec,
* if IRQ remapping is enabled. APIC IDs above 15 bits are
* only permitted if IRQ remapping is enabled, so check that.
*/
- if (apic->apic_id_valid(32768))
+ if (apic_id_valid(32768))
return 0;
return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec);
@@ -841,10 +852,21 @@ void lapic_online(void)
this_cpu_write(vector_irq[vector], __setup_vector_irq(vector));
}
+static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr);
+
void lapic_offline(void)
{
+ struct vector_cleanup *cl = this_cpu_ptr(&vector_cleanup);
+
lock_vector_lock();
+
+ /* In case the vector cleanup timer has not expired */
+ __vector_cleanup(cl, false);
+
irq_matrix_offline(vector_matrix);
+ WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0);
+ WARN_ON_ONCE(!hlist_empty(&cl->head));
+
unlock_vector_lock();
}
@@ -876,7 +898,7 @@ static int apic_retrigger_irq(struct irq_data *irqd)
unsigned long flags;
raw_spin_lock_irqsave(&vector_lock, flags);
- apic->send_IPI(apicd->cpu, apicd->vector);
+ __apic_send_IPI(apicd->cpu, apicd->vector);
raw_spin_unlock_irqrestore(&vector_lock, flags);
return 1;
@@ -885,7 +907,7 @@ static int apic_retrigger_irq(struct irq_data *irqd)
void apic_ack_irq(struct irq_data *irqd)
{
irq_move_irq(irqd);
- ack_APIC_irq();
+ apic_eoi();
}
void apic_ack_edge(struct irq_data *irqd)
@@ -934,62 +956,98 @@ static void free_moved_vector(struct apic_chip_data *apicd)
apicd->move_in_progress = 0;
}
-DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup)
+static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
{
- struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
struct apic_chip_data *apicd;
struct hlist_node *tmp;
+ bool rearm = false;
- ack_APIC_irq();
- /* Prevent vectors vanishing under us */
- raw_spin_lock(&vector_lock);
+ lockdep_assert_held(&vector_lock);
- hlist_for_each_entry_safe(apicd, tmp, clhead, clist) {
+ hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
unsigned int irr, vector = apicd->prev_vector;
/*
* Paranoia: Check if the vector that needs to be cleaned
- * up is registered at the APICs IRR. If so, then this is
- * not the best time to clean it up. Clean it up in the
- * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
- * to this CPU. IRQ_MOVE_CLEANUP_VECTOR is the lowest
- * priority external vector, so on return from this
- * interrupt the device interrupt will happen first.
+ * up is registered at the APICs IRR. That's clearly a
+ * hardware issue if the vector arrived on the old target
+ * _after_ interrupts were disabled above. Keep @apicd
+ * on the list and schedule the timer again to give the CPU
+ * a chance to handle the pending interrupt.
+ *
+ * Do not check IRR when called from lapic_offline(), because
+ * fixup_irqs() was just called to scan IRR for set bits and
+ * forward them to new destination CPUs via IPIs.
*/
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0;
if (irr & (1U << (vector % 32))) {
- apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
+ rearm = true;
continue;
}
free_moved_vector(apicd);
}
- raw_spin_unlock(&vector_lock);
+ /*
+ * Must happen under vector_lock to make the timer_pending() check
+ * in __vector_schedule_cleanup() race free against the rearm here.
+ */
+ if (rearm)
+ mod_timer(&cl->timer, jiffies + 1);
+}
+
+static void vector_cleanup_callback(struct timer_list *tmr)
+{
+ struct vector_cleanup *cl = container_of(tmr, typeof(*cl), timer);
+
+ /* Prevent vectors vanishing under us */
+ raw_spin_lock_irq(&vector_lock);
+ __vector_cleanup(cl, true);
+ raw_spin_unlock_irq(&vector_lock);
}
-static void __send_cleanup_vector(struct apic_chip_data *apicd)
+static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
{
- unsigned int cpu;
+ unsigned int cpu = apicd->prev_cpu;
raw_spin_lock(&vector_lock);
apicd->move_in_progress = 0;
- cpu = apicd->prev_cpu;
if (cpu_online(cpu)) {
- hlist_add_head(&apicd->clist, per_cpu_ptr(&cleanup_list, cpu));
- apic->send_IPI(cpu, IRQ_MOVE_CLEANUP_VECTOR);
+ struct vector_cleanup *cl = per_cpu_ptr(&vector_cleanup, cpu);
+
+ hlist_add_head(&apicd->clist, &cl->head);
+
+ /*
+ * The lockless timer_pending() check is safe here. If it
+ * returns true, then the callback will observe this new
+ * apic data in the hlist as everything is serialized by
+ * vector lock.
+ *
+ * If it returns false then the timer is either not armed
+ * or the other CPU executes the callback, which again
+ * would be blocked on vector lock. Rearming it in the
+ * latter case makes it fire for nothing.
+ *
+ * This is also safe against the callback rearming the timer
+ * because that's serialized via vector lock too.
+ */
+ if (!timer_pending(&cl->timer)) {
+ cl->timer.expires = jiffies + 1;
+ add_timer_on(&cl->timer, cpu);
+ }
} else {
apicd->prev_vector = 0;
}
raw_spin_unlock(&vector_lock);
}
-void send_cleanup_vector(struct irq_cfg *cfg)
+void vector_schedule_cleanup(struct irq_cfg *cfg)
{
struct apic_chip_data *apicd;
apicd = container_of(cfg, struct apic_chip_data, hw_irq_cfg);
if (apicd->move_in_progress)
- __send_cleanup_vector(apicd);
+ __vector_schedule_cleanup(apicd);
}
void irq_complete_move(struct irq_cfg *cfg)
@@ -1007,7 +1065,7 @@ void irq_complete_move(struct irq_cfg *cfg)
* on the same CPU.
*/
if (apicd->cpu == smp_processor_id())
- __send_cleanup_vector(apicd);
+ __vector_schedule_cleanup(apicd);
}
/*
@@ -1150,7 +1208,7 @@ static void __init print_local_APIC(void *dummy)
u64 icr;
pr_debug("printing local APIC contents on CPU#%d/%d:\n",
- smp_processor_id(), hard_smp_processor_id());
+ smp_processor_id(), read_apic_id());
v = apic_read(APIC_ID);
pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id());
v = apic_read(APIC_LVR);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index b2b2b7f3e03f..affbff65e497 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -83,16 +83,6 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
-static void x2apic_send_IPI_allbutself(int vector)
-{
- __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
-}
-
-static void x2apic_send_IPI_all(int vector)
-{
- __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
-}
-
static u32 x2apic_calc_apicid(unsigned int cpu)
{
return x86_cpu_to_logical_apicid[cpu];
@@ -236,8 +226,6 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .apic_id_valid = x2apic_apic_id_valid,
- .apic_id_registered = x2apic_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
@@ -247,12 +235,11 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.check_apicid_used = NULL,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = x2apic_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
@@ -265,15 +252,11 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
- .inquire_remote_apic = NULL,
-
.read = native_apic_msr_read,
.write = native_apic_msr_write,
- .eoi_write = native_apic_msr_eoi_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 896bc41cb2ba..788cdb4ee394 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -8,11 +8,13 @@
int x2apic_phys;
static struct apic apic_x2apic_phys;
-static u32 x2apic_max_apicid __ro_after_init;
+u32 x2apic_max_apicid __ro_after_init = UINT_MAX;
void __init x2apic_set_max_apicid(u32 apicid)
{
x2apic_max_apicid = apicid;
+ if (apic->x2apic_set_max_apicid)
+ apic->max_apic_id = apicid;
}
static int __init set_x2apic_phys_mode(char *arg)
@@ -81,43 +83,28 @@ static void
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
-static void x2apic_send_IPI_allbutself(int vector)
+static void __x2apic_send_IPI_shorthand(int vector, u32 which)
{
- __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
-}
+ unsigned long cfg = __prepare_ICR(which, vector, 0);
-static void x2apic_send_IPI_all(int vector)
-{
- __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
+ /* x2apic MSRs are special and need a special fence: */
+ weak_wrmsr_fence();
+ native_x2apic_icr_write(cfg, 0);
}
-static void init_x2apic_ldr(void)
+void x2apic_send_IPI_allbutself(int vector)
{
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
}
-static int x2apic_phys_probe(void)
-{
- if (!x2apic_mode)
- return 0;
-
- if (x2apic_phys || x2apic_fadt_phys())
- return 1;
-
- return apic == &apic_x2apic_phys;
-}
-
-/* Common x2apic functions, also used by x2apic_cluster */
-int x2apic_apic_id_valid(u32 apicid)
+void x2apic_send_IPI_all(int vector)
{
- if (x2apic_max_apicid && apicid > x2apic_max_apicid)
- return 0;
-
- return 1;
+ __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
}
-int x2apic_apic_id_registered(void)
+void x2apic_send_IPI_self(int vector)
{
- return 1;
+ apic_write(APIC_SELF_IPI, vector);
}
void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
@@ -126,13 +113,15 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
native_x2apic_icr_write(cfg, apicid);
}
-void __x2apic_send_IPI_shorthand(int vector, u32 which)
+static int x2apic_phys_probe(void)
{
- unsigned long cfg = __prepare_ICR(which, vector, 0);
+ if (!x2apic_mode)
+ return 0;
- /* x2apic MSRs are special and need a special fence: */
- weak_wrmsr_fence();
- native_x2apic_icr_write(cfg, 0);
+ if (x2apic_phys || x2apic_fadt_phys())
+ return 1;
+
+ return apic == &apic_x2apic_phys;
}
unsigned int x2apic_get_apic_id(unsigned long id)
@@ -150,33 +139,22 @@ int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
return initial_apicid >> index_msb;
}
-void x2apic_send_IPI_self(int vector)
-{
- apic_write(APIC_SELF_IPI, vector);
-}
-
static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .apic_id_valid = x2apic_apic_id_valid,
- .apic_id_registered = x2apic_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
- .check_apicid_used = NULL,
- .init_apic_ldr = init_x2apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = x2apic_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
+ .x2apic_set_max_apicid = true,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
@@ -189,15 +167,11 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
- .inquire_remote_apic = NULL,
-
.read = native_apic_msr_read,
.write = native_apic_msr_write,
- .eoi_write = native_apic_msr_eoi_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index b524dee1cbbb..d9f5d7492f83 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -25,6 +25,8 @@
#include <asm/uv/uv.h>
#include <asm/apic.h>
+#include "local.h"
+
static enum uv_system_type uv_system_type;
static int uv_hubbed_system;
static int uv_hubless_system;
@@ -777,30 +779,6 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
-static int uv_apic_id_valid(u32 apicid)
-{
- return 1;
-}
-
-static int uv_apic_id_registered(void)
-{
- return 1;
-}
-
-static void uv_init_apic_ldr(void)
-{
-}
-
-static u32 apic_uv_calc_apicid(unsigned int cpu)
-{
- return apic_default_calc_apicid(cpu);
-}
-
-static unsigned int x2apic_get_apic_id(unsigned long id)
-{
- return id;
-}
-
static u32 set_apic_id(unsigned int id)
{
return id;
@@ -816,11 +794,6 @@ static int uv_phys_pkg_id(int initial_apicid, int index_msb)
return uv_read_apic_id() >> index_msb;
}
-static void uv_send_IPI_self(int vector)
-{
- apic_write(APIC_SELF_IPI, vector);
-}
-
static int uv_probe(void)
{
return apic == &apic_x2apic_uv_x;
@@ -831,45 +804,35 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.name = "UV large system",
.probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .apic_id_valid = uv_apic_id_valid,
- .apic_id_registered = uv_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
- .check_apicid_used = NULL,
- .init_apic_ldr = uv_init_apic_ldr,
- .ioapic_phys_id_map = NULL,
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .apicid_to_cpu_present = NULL,
- .check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = uv_phys_pkg_id,
+ .max_apic_id = UINT_MAX,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = set_apic_id,
- .calc_dest_apicid = apic_uv_calc_apicid,
+ .calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = uv_send_IPI_one,
.send_IPI_mask = uv_send_IPI_mask,
.send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_allbutself = uv_send_IPI_allbutself,
.send_IPI_all = uv_send_IPI_all,
- .send_IPI_self = uv_send_IPI_self,
+ .send_IPI_self = x2apic_send_IPI_self,
.wakeup_secondary_cpu = uv_wakeup_secondary,
- .inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
- .eoi_write = native_apic_msr_eoi_write,
+ .eoi = native_apic_msr_eoi,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
- .wait_icr_idle = native_x2apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3
@@ -1844,7 +1807,7 @@ static void __init uv_system_init_hub(void)
/* Initialize per CPU info: */
for_each_possible_cpu(cpu) {
- int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
unsigned short bid;
unsigned short pnode;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c6c15ce1952f..5934ee5bc087 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -239,12 +239,6 @@ extern int (*console_blank_hook)(int);
#endif
/*
- * The apm_bios device is one of the misc char devices.
- * This is its minor number.
- */
-#define APM_MINOR_DEV 134
-
-/*
* Various options can be changed at boot time as follows:
* (We allow underscores for compatibility with the modules code)
* apm=on/off enable/disable APM
diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c
index 44c3601cfdc4..190c120f4285 100644
--- a/arch/x86/kernel/audit_64.c
+++ b/arch/x86/kernel/audit_64.c
@@ -63,11 +63,6 @@ int audit_classify_syscall(int abi, unsigned syscall)
static int __init audit_classes_init(void)
{
#ifdef CONFIG_IA32_EMULATION
- extern __u32 ia32_dir_class[];
- extern __u32 ia32_write_class[];
- extern __u32 ia32_read_class[];
- extern __u32 ia32_chattr_class[];
- extern __u32 ia32_signal_class[];
audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
new file mode 100644
index 000000000000..d2c732a34e5d
--- /dev/null
+++ b/arch/x86/kernel/cet.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <asm/bugs.h>
+#include <asm/traps.h>
+
+enum cp_error_code {
+ CP_EC = (1 << 15) - 1,
+
+ CP_RET = 1,
+ CP_IRET = 2,
+ CP_ENDBR = 3,
+ CP_RSTRORSSP = 4,
+ CP_SETSSBSY = 5,
+
+ CP_ENCL = 1 << 15,
+};
+
+static const char cp_err[][10] = {
+ [0] = "unknown",
+ [1] = "near ret",
+ [2] = "far/iret",
+ [3] = "endbranch",
+ [4] = "rstorssp",
+ [5] = "setssbsy",
+};
+
+static const char *cp_err_string(unsigned long error_code)
+{
+ unsigned int cpec = error_code & CP_EC;
+
+ if (cpec >= ARRAY_SIZE(cp_err))
+ cpec = 0;
+ return cp_err[cpec];
+}
+
+static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code)
+{
+ WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n",
+ user_mode(regs) ? "user mode" : "kernel mode",
+ cp_err_string(error_code));
+}
+
+static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ struct task_struct *tsk;
+ unsigned long ssp;
+
+ /*
+ * An exception was just taken from userspace. Since interrupts are disabled
+ * here, no scheduling should have messed with the registers yet and they
+ * will be whatever is live in userspace. So read the SSP before enabling
+ * interrupts so locking the fpregs to do it later is not required.
+ */
+ rdmsrl(MSR_IA32_PL3_SSP, ssp);
+
+ cond_local_irq_enable(regs);
+
+ tsk = current;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_CP;
+
+ /* Ratelimit to prevent log spamming. */
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ __ratelimit(&cpf_rate)) {
+ pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, ssp, error_code,
+ cp_err_string(error_code),
+ error_code & CP_ENCL ? " in enclave" : "");
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0);
+ cond_local_irq_disable(regs);
+}
+
+static __ro_after_init bool ibt_fatal = true;
+
+static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ if ((error_code & CP_EC) != CP_ENDBR) {
+ do_unexpected_cp(regs, error_code);
+ return;
+ }
+
+ if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) {
+ regs->ax = 0;
+ return;
+ }
+
+ pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
+ if (!ibt_fatal) {
+ printk(KERN_DEFAULT CUT_HERE);
+ __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
+ return;
+ }
+ BUG();
+}
+
+static int __init ibt_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+
+ if (!strcmp(str, "warn"))
+ ibt_fatal = false;
+
+ return 1;
+}
+
+__setup("ibt=", ibt_setup);
+
+DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
+{
+ if (user_mode(regs)) {
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ do_user_cp_fault(regs, error_code);
+ else
+ do_unexpected_cp(regs, error_code);
+ } else {
+ if (cpu_feature_enabled(X86_FEATURE_IBT))
+ do_kernel_cp_fault(regs, error_code);
+ else
+ do_unexpected_cp(regs, error_code);
+ }
+}
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 485441b7f030..bfeb18fad63f 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -51,7 +51,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
* will block the interrupt whose vector is lower than
* HYPERVISOR_CALLBACK_VECTOR.
*/
- ack_APIC_irq();
+ apic_eoi();
inc_irq_stat(irq_hv_callback_count);
if (acrn_intr_handler)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7eca6a8abbb1..dd8379d84445 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1047,7 +1047,7 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_FSRS);
/* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
+ c->apicid = read_apic_id();
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 41b573f34a10..382d4e6b848d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -587,27 +587,43 @@ __noendbr void ibt_restore(u64 save)
static __always_inline void setup_cet(struct cpuinfo_x86 *c)
{
- u64 msr = CET_ENDBR_EN;
+ bool user_shstk, kernel_ibt;
- if (!HAS_KERNEL_IBT ||
- !cpu_feature_enabled(X86_FEATURE_IBT))
+ if (!IS_ENABLED(CONFIG_X86_CET))
return;
- wrmsrl(MSR_IA32_S_CET, msr);
+ kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT);
+ user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK);
+
+ if (!kernel_ibt && !user_shstk)
+ return;
+
+ if (user_shstk)
+ set_cpu_cap(c, X86_FEATURE_USER_SHSTK);
+
+ if (kernel_ibt)
+ wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN);
+ else
+ wrmsrl(MSR_IA32_S_CET, 0);
+
cr4_set_bits(X86_CR4_CET);
- if (!ibt_selftest()) {
+ if (kernel_ibt && ibt_selftest()) {
pr_err("IBT selftest: Failed!\n");
wrmsrl(MSR_IA32_S_CET, 0);
setup_clear_cpu_cap(X86_FEATURE_IBT);
- return;
}
}
__noendbr void cet_disable(void)
{
- if (cpu_feature_enabled(X86_FEATURE_IBT))
- wrmsrl(MSR_IA32_S_CET, 0);
+ if (!(cpu_feature_enabled(X86_FEATURE_IBT) ||
+ cpu_feature_enabled(X86_FEATURE_SHSTK)))
+ return;
+
+ wrmsrl(MSR_IA32_S_CET, 0);
+ wrmsrl(MSR_IA32_U_CET, 0);
}
/*
@@ -1264,11 +1280,11 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
@@ -1491,6 +1507,9 @@ static void __init cpu_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
+ setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
if (arglen <= 0)
return;
@@ -1958,7 +1977,7 @@ void enable_sep_cpu(void)
}
#endif
-void __init identify_boot_cpu(void)
+static __init void identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index f6748c8bd647..e462c1d3800a 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -81,6 +81,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
+ { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
{}
};
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 5a2962c492d3..defdc594be14 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -8,6 +8,7 @@
*/
#include <linux/io.h>
+#include <asm/apic.h>
#include <asm/cpu.h>
#include <asm/smp.h>
#include <asm/numa.h>
@@ -300,7 +301,7 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
+ c->apicid = read_apic_id();
/*
* XXX someone from Hygon needs to confirm this DTRT
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index c4ec4ca47e11..c267f43de39e 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -759,7 +759,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
inc_irq_stat(irq_deferred_error_count);
deferred_error_int_vector();
trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
- ack_APIC_irq();
+ apic_eoi();
}
/*
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 12cf2e7ca33c..4d8d4bcf915d 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -270,8 +270,7 @@ static void __maybe_unused raise_mce(struct mce *m)
mce_irq_ipi, NULL, 0);
preempt_enable();
} else if (m->inject_flags & MCJ_NMI_BROADCAST)
- apic->send_IPI_mask(mce_inject_cpumask,
- NMI_VECTOR);
+ __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
}
start = jiffies;
while (!cpumask_empty(mce_inject_cpumask)) {
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index 6a059a035021..ef4e7bb5fd88 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -27,5 +27,5 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
- ack_APIC_irq();
+ apic_eoi();
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c7969e806c64..e6bba12c759c 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -32,6 +32,7 @@
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
+#include <asm/svm.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
@@ -39,6 +40,10 @@ bool hv_root_partition;
bool hv_nested;
struct ms_hyperv_info ms_hyperv;
+/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */
+bool hyperv_paravisor_present __ro_after_init;
+EXPORT_SYMBOL_GPL(hyperv_paravisor_present);
+
#if IS_ENABLED(CONFIG_HYPERV)
static inline unsigned int hv_get_nested_reg(unsigned int reg)
{
@@ -65,8 +70,8 @@ u64 hv_get_non_nested_register(unsigned int reg)
{
u64 value;
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
- hv_ghcb_msr_read(reg, &value);
+ if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present)
+ hv_ivm_msr_read(reg, &value);
else
rdmsrl(reg, value);
return value;
@@ -75,8 +80,8 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_register);
void hv_set_non_nested_register(unsigned int reg, u64 value)
{
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
- hv_ghcb_msr_write(reg, value);
+ if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) {
+ hv_ivm_msr_write(reg, value);
/* Write proxy bit via wrmsl instruction */
if (hv_is_sint_reg(reg))
@@ -119,7 +124,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
vmbus_handler();
if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
- ack_APIC_irq();
+ apic_eoi();
set_irq_regs(old_regs);
}
@@ -147,7 +152,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
if (hv_stimer0_handler)
hv_stimer0_handler();
add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
- ack_APIC_irq();
+ apic_eoi();
set_irq_regs(old_regs);
}
@@ -295,6 +300,15 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
native_smp_prepare_cpus(max_cpus);
+ /*
+ * Override wakeup_secondary_cpu_64 callback for SEV-SNP
+ * enlightened guest.
+ */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap;
+ return;
+ }
+
#ifdef CONFIG_X86_64
for_each_present_cpu(i) {
if (i == 0)
@@ -313,6 +327,26 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
}
#endif
+/*
+ * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the
+ * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0
+ * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns
+ * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a
+ * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and
+ * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs
+ * from the array and hence doesn't create the necessary irq description info.
+ *
+ * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here,
+ * except don't change 'legacy_pic', which keeps its default value
+ * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero
+ * nr_legacy_irqs() and eventually serial console interrupts works properly.
+ */
+static void __init reduced_hw_init(void)
+{
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_max_functions_eax;
@@ -399,11 +433,33 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.shared_gpa_boundary =
BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+ hyperv_paravisor_present = !!ms_hyperv.paravisor_present;
+
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
- if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
static_branch_enable(&isolation_type_snp);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
+ static_branch_enable(&isolation_type_tdx);
+
+ /* A TDX VM must use x2APIC and doesn't use lazy EOI. */
+ ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+
+ if (!ms_hyperv.paravisor_present) {
+ /* To be supported: more work is required. */
+ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+
+ /* HV_REGISTER_CRASH_CTL is unsupported. */
+ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ /* Don't trust Hyper-V's TLB-flushing hypercalls. */
+ ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+
+ x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+ }
+ }
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
@@ -473,7 +529,7 @@ static void __init ms_hyperv_init_platform(void)
#if IS_ENABLED(CONFIG_HYPERV)
if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
- (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
+ ms_hyperv.paravisor_present)
hv_vtom_init();
/*
* Setup the hook to get control post apic initialization.
@@ -497,7 +553,8 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition)
+ if (hv_root_partition ||
+ (!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
@@ -560,6 +617,22 @@ static bool __init ms_hyperv_msi_ext_dest_id(void)
return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+#endif
+
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
@@ -567,4 +640,8 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
+#endif
};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 099b6f0d96bd..31c0e68f6227 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -4,6 +4,8 @@
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include <asm/prctl.h>
+#include <linux/proc_fs.h>
#include "cpu.h"
@@ -175,3 +177,24 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static void dump_x86_features(struct seq_file *m, unsigned long features)
+{
+ if (features & ARCH_SHSTK_SHSTK)
+ seq_puts(m, "shstk ");
+ if (features & ARCH_SHSTK_WRSS)
+ seq_puts(m, "wrss ");
+}
+
+void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task)
+{
+ seq_puts(m, "x86_Thread_features:\t");
+ dump_x86_features(m, task->thread.features);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "x86_Thread_features_locked:\t");
+ dump_x86_features(m, task->thread.features_locked);
+ seq_putc(m, '\n');
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 458cb7419502..8f559eeae08e 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -45,7 +45,21 @@ static u64 prefetch_disable_bits;
*/
static unsigned int pseudo_lock_major;
static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
-static struct class *pseudo_lock_class;
+
+static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
+{
+ const struct rdtgroup *rdtgrp;
+
+ rdtgrp = dev_get_drvdata(dev);
+ if (mode)
+ *mode = 0600;
+ return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
+}
+
+static const struct class pseudo_lock_class = {
+ .name = "pseudo_lock",
+ .devnode = pseudo_lock_devnode,
+};
/**
* get_prefetch_disable_bits - prefetch disable bits of supported platforms
@@ -1353,7 +1367,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
&pseudo_measure_fops);
}
- dev = device_create(pseudo_lock_class, NULL,
+ dev = device_create(&pseudo_lock_class, NULL,
MKDEV(pseudo_lock_major, new_minor),
rdtgrp, "%s", rdtgrp->kn->name);
@@ -1383,7 +1397,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
goto out;
out_device:
- device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
+ device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
out_debugfs:
debugfs_remove_recursive(plr->debugfs_dir);
pseudo_lock_minor_release(new_minor);
@@ -1424,7 +1438,7 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
pseudo_lock_cstates_relax(plr);
debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
- device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
+ device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
pseudo_lock_minor_release(plr->minor);
free:
@@ -1560,16 +1574,6 @@ static const struct file_operations pseudo_lock_dev_fops = {
.mmap = pseudo_lock_dev_mmap,
};
-static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
-{
- const struct rdtgroup *rdtgrp;
-
- rdtgrp = dev_get_drvdata(dev);
- if (mode)
- *mode = 0600;
- return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
-}
-
int rdt_pseudo_lock_init(void)
{
int ret;
@@ -1580,21 +1584,18 @@ int rdt_pseudo_lock_init(void)
pseudo_lock_major = ret;
- pseudo_lock_class = class_create("pseudo_lock");
- if (IS_ERR(pseudo_lock_class)) {
- ret = PTR_ERR(pseudo_lock_class);
+ ret = class_register(&pseudo_lock_class);
+ if (ret) {
unregister_chrdev(pseudo_lock_major, "pseudo_lock");
return ret;
}
- pseudo_lock_class->devnode = pseudo_lock_devnode;
return 0;
}
void rdt_pseudo_lock_release(void)
{
- class_destroy(pseudo_lock_class);
- pseudo_lock_class = NULL;
+ class_unregister(&pseudo_lock_class);
unregister_chrdev(pseudo_lock_major, "pseudo_lock");
pseudo_lock_major = 0;
}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index bdc0d5539b57..dae436253de4 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,7 +40,6 @@
#include <asm/processor.h>
#include <asm/msr.h>
-static struct class *cpuid_class;
static enum cpuhp_state cpuhp_cpuid_state;
struct cpuid_regs_done {
@@ -124,26 +123,31 @@ static const struct file_operations cpuid_fops = {
.open = cpuid_open,
};
+static char *cpuid_devnode(const struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
+}
+
+static const struct class cpuid_class = {
+ .name = "cpuid",
+ .devnode = cpuid_devnode,
+};
+
static int cpuid_device_create(unsigned int cpu)
{
struct device *dev;
- dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
+ dev = device_create(&cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
"cpu%d", cpu);
return PTR_ERR_OR_ZERO(dev);
}
static int cpuid_device_destroy(unsigned int cpu)
{
- device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ device_destroy(&cpuid_class, MKDEV(CPUID_MAJOR, cpu));
return 0;
}
-static char *cpuid_devnode(const struct device *dev, umode_t *mode)
-{
- return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
-}
-
static int __init cpuid_init(void)
{
int err;
@@ -154,12 +158,9 @@ static int __init cpuid_init(void)
CPUID_MAJOR);
return -EBUSY;
}
- cpuid_class = class_create("cpuid");
- if (IS_ERR(cpuid_class)) {
- err = PTR_ERR(cpuid_class);
+ err = class_register(&cpuid_class);
+ if (err)
goto out_chrdev;
- }
- cpuid_class->devnode = cpuid_devnode;
err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
cpuid_device_create, cpuid_device_destroy);
@@ -170,7 +171,7 @@ static int __init cpuid_init(void)
return 0;
out_class:
- class_destroy(cpuid_class);
+ class_unregister(&cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
return err;
@@ -180,7 +181,7 @@ module_init(cpuid_init);
static void __exit cpuid_exit(void)
{
cpuhp_remove_state(cpuhp_cpuid_state);
- class_destroy(cpuid_class);
+ class_unregister(&cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
}
module_exit(cpuid_exit);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index cdd92ab43cda..587c7743fd21 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -158,8 +158,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
crash_save_cpu(regs, safe_smp_processor_id());
}
-#ifdef CONFIG_KEXEC_FILE
-
+#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
unsigned int *nr_ranges = arg;
@@ -231,7 +230,7 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
/* Prepare elf headers. Return addr and size */
static int prepare_elf_headers(struct kimage *image, void **addr,
- unsigned long *sz)
+ unsigned long *sz, unsigned long *nr_mem_ranges)
{
struct crash_mem *cmem;
int ret;
@@ -249,6 +248,9 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
if (ret)
goto out;
+ /* Return the computed number of memory ranges, for hotplug usage */
+ *nr_mem_ranges = cmem->nr_ranges;
+
/* By default prepare 64bit headers */
ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
@@ -256,7 +258,9 @@ out:
vfree(cmem);
return ret;
}
+#endif
+#ifdef CONFIG_KEXEC_FILE
static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
{
unsigned int nr_e820_entries;
@@ -371,18 +375,42 @@ out:
int crash_load_segments(struct kimage *image)
{
int ret;
+ unsigned long pnum = 0;
struct kexec_buf kbuf = { .image = image, .buf_min = 0,
.buf_max = ULONG_MAX, .top_down = false };
/* Prepare elf headers and add a segment */
- ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
+ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum);
if (ret)
return ret;
- image->elf_headers = kbuf.buffer;
- image->elf_headers_sz = kbuf.bufsz;
+ image->elf_headers = kbuf.buffer;
+ image->elf_headers_sz = kbuf.bufsz;
+ kbuf.memsz = kbuf.bufsz;
+
+#ifdef CONFIG_CRASH_HOTPLUG
+ /*
+ * The elfcorehdr segment size accounts for VMCOREINFO, kernel_map,
+ * maximum CPUs and maximum memory ranges.
+ */
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ pnum = 2 + CONFIG_NR_CPUS_DEFAULT + CONFIG_CRASH_MAX_MEMORY_RANGES;
+ else
+ pnum += 2 + CONFIG_NR_CPUS_DEFAULT;
+
+ if (pnum < (unsigned long)PN_XNUM) {
+ kbuf.memsz = pnum * sizeof(Elf64_Phdr);
+ kbuf.memsz += sizeof(Elf64_Ehdr);
+
+ image->elfcorehdr_index = image->nr_segments;
+
+ /* Mark as usable to crash kernel, else crash kernel fails on boot */
+ image->elf_headers_sz = kbuf.memsz;
+ } else {
+ pr_err("number of Phdrs %lu exceeds max\n", pnum);
+ }
+#endif
- kbuf.memsz = kbuf.bufsz;
kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
@@ -395,3 +423,103 @@ int crash_load_segments(struct kimage *image)
return ret;
}
#endif /* CONFIG_KEXEC_FILE */
+
+#ifdef CONFIG_CRASH_HOTPLUG
+
+#undef pr_fmt
+#define pr_fmt(fmt) "crash hp: " fmt
+
+/* These functions provide the value for the sysfs crash_hotplug nodes */
+#ifdef CONFIG_HOTPLUG_CPU
+int arch_crash_hotplug_cpu_support(void)
+{
+ return crash_check_update_elfcorehdr();
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_crash_hotplug_memory_support(void)
+{
+ return crash_check_update_elfcorehdr();
+}
+#endif
+
+unsigned int arch_crash_get_elfcorehdr_size(void)
+{
+ unsigned int sz;
+
+ /* kernel_map, VMCOREINFO and maximum CPUs */
+ sz = 2 + CONFIG_NR_CPUS_DEFAULT;
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+ sz += CONFIG_CRASH_MAX_MEMORY_RANGES;
+ sz *= sizeof(Elf64_Phdr);
+ return sz;
+}
+
+/**
+ * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
+ * @image: a pointer to kexec_crash_image
+ *
+ * Prepare the new elfcorehdr and replace the existing elfcorehdr.
+ */
+void arch_crash_handle_hotplug_event(struct kimage *image)
+{
+ void *elfbuf = NULL, *old_elfcorehdr;
+ unsigned long nr_mem_ranges;
+ unsigned long mem, memsz;
+ unsigned long elfsz = 0;
+
+ /*
+ * As crash_prepare_elf64_headers() has already described all
+ * possible CPUs, there is no need to update the elfcorehdr
+ * for additional CPU changes.
+ */
+ if ((image->file_mode || image->elfcorehdr_updated) &&
+ ((image->hp_action == KEXEC_CRASH_HP_ADD_CPU) ||
+ (image->hp_action == KEXEC_CRASH_HP_REMOVE_CPU)))
+ return;
+
+ /*
+ * Create the new elfcorehdr reflecting the changes to CPU and/or
+ * memory resources.
+ */
+ if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) {
+ pr_err("unable to create new elfcorehdr");
+ goto out;
+ }
+
+ /*
+ * Obtain address and size of the elfcorehdr segment, and
+ * check it against the new elfcorehdr buffer.
+ */
+ mem = image->segment[image->elfcorehdr_index].mem;
+ memsz = image->segment[image->elfcorehdr_index].memsz;
+ if (elfsz > memsz) {
+ pr_err("update elfcorehdr elfsz %lu > memsz %lu",
+ elfsz, memsz);
+ goto out;
+ }
+
+ /*
+ * Copy new elfcorehdr over the old elfcorehdr at destination.
+ */
+ old_elfcorehdr = kmap_local_page(pfn_to_page(mem >> PAGE_SHIFT));
+ if (!old_elfcorehdr) {
+ pr_err("mapping elfcorehdr segment failed\n");
+ goto out;
+ }
+
+ /*
+ * Temporarily invalidate the crash image while the
+ * elfcorehdr is updated.
+ */
+ xchg(&kexec_crash_image, NULL);
+ memcpy_flushcache(old_elfcorehdr, elfbuf, elfsz);
+ xchg(&kexec_crash_image, image);
+ kunmap_local(old_elfcorehdr);
+ pr_debug("updated elfcorehdr\n");
+
+out:
+ vfree(elfbuf);
+}
+#endif
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 28da5dd83fc0..87d38f17ff5c 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -128,16 +128,15 @@ static void __init dtb_setup_hpet(void)
static void __init dtb_cpu_setup(void)
{
struct device_node *dn;
- u32 apic_id, version;
+ u32 apic_id;
- version = GET_APIC_VERSION(apic_read(APIC_LVR));
for_each_of_cpu_node(dn) {
apic_id = of_get_cpu_hwid(dn, 0);
if (apic_id == ~0U) {
pr_warn("%pOF: missing local APIC ID\n", dn);
continue;
}
- generic_processor_info(apic_id, version);
+ generic_processor_info(apic_id);
}
}
@@ -158,19 +157,15 @@ static void __init dtb_lapic_setup(void)
/* Did the boot loader setup the local APIC ? */
if (!boot_cpu_has(X86_FEATURE_APIC)) {
- if (apic_force_enable(lapic_addr))
+ /* Try force enabling, which registers the APIC address */
+ if (!apic_force_enable(lapic_addr))
return;
- }
- smp_found_config = 1;
- if (of_property_read_bool(dn, "intel,virtual-wire-mode")) {
- pr_info("Virtual Wire compatibility mode.\n");
- pic_mode = 0;
} else {
- pr_info("IMCR and PIC compatibility mode.\n");
- pic_mode = 1;
+ register_lapic_address(lapic_addr);
}
-
- register_lapic_address(lapic_addr);
+ smp_found_config = 1;
+ pic_mode = !of_property_read_bool(dn, "intel,virtual-wire-mode");
+ pr_info("%s compatibility mode.\n", pic_mode ? "IMCR and PIC" : "Virtual Wire");
}
#endif /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 98e507cc7d34..a86d37052a64 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -552,8 +552,36 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu)
}
}
+/* A passed ssp of zero will not cause any update */
+static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
+{
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ struct cet_user_state *xstate;
+
+ /* If ssp update is not needed. */
+ if (!ssp)
+ return 0;
+
+ xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave,
+ XFEATURE_CET_USER);
+
+ /*
+ * If there is a non-zero ssp, then 'dst' must be configured with a shadow
+ * stack and the fpu state should be up to date since it was just copied
+ * from the parent in fpu_clone(). So there must be a valid non-init CET
+ * state location in the buffer.
+ */
+ if (WARN_ON_ONCE(!xstate))
+ return 1;
+
+ xstate->user_ssp = (u64)ssp;
+#endif
+ return 0;
+}
+
/* Clone current's FPU state on fork */
-int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
+int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
+ unsigned long ssp)
{
struct fpu *src_fpu = &current->thread.fpu;
struct fpu *dst_fpu = &dst->thread.fpu;
@@ -613,6 +641,12 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
if (use_xsave())
dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID;
+ /*
+ * Update shadow stack pointer, in case it changed during clone.
+ */
+ if (update_fpu_shstk(dst, ssp))
+ return 1;
+
trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
@@ -753,6 +787,24 @@ void switch_fpu_return(void)
}
EXPORT_SYMBOL_GPL(switch_fpu_return);
+void fpregs_lock_and_load(void)
+{
+ /*
+ * fpregs_lock() only disables preemption (mostly). So modifying state
+ * in an interrupt could screw up some in progress fpregs operation.
+ * Warn about it.
+ */
+ WARN_ON_ONCE(!irq_fpu_usable());
+ WARN_ON_ONCE(current->flags & PF_KTHREAD);
+
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+}
+
#ifdef CONFIG_X86_DEBUG_FPU
/*
* If current FPU state according to its tracking (loaded FPU context on this
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 6d056b68f4ed..6bc1eb2a21bd 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -8,6 +8,7 @@
#include <asm/fpu/api.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
+#include <asm/prctl.h>
#include "context.h"
#include "internal.h"
@@ -174,6 +175,86 @@ out:
return ret;
}
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+int ssp_active(struct task_struct *target, const struct user_regset *regset)
+{
+ if (target->thread.features & ARCH_SHSTK_SHSTK)
+ return regset->n;
+
+ return 0;
+}
+
+int ssp_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ struct cet_user_state *cetregs;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -ENODEV;
+
+ sync_fpstate(fpu);
+ cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER);
+ if (WARN_ON(!cetregs)) {
+ /*
+ * This shouldn't ever be NULL because shadow stack was
+ * verified to be enabled above. This means
+ * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so
+ * XFEATURE_CET_USER should not be in the init state.
+ */
+ return -ENODEV;
+ }
+
+ return membuf_write(&to, (unsigned long *)&cetregs->user_ssp,
+ sizeof(cetregs->user_ssp));
+}
+
+int ssp_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ struct xregs_state *xsave = &fpu->fpstate->regs.xsave;
+ struct cet_user_state *cetregs;
+ unsigned long user_ssp;
+ int r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !ssp_active(target, regset))
+ return -ENODEV;
+
+ if (pos != 0 || count != sizeof(user_ssp))
+ return -EINVAL;
+
+ r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1);
+ if (r)
+ return r;
+
+ /*
+ * Some kernel instructions (IRET, etc) can cause exceptions in the case
+ * of disallowed CET register values. Just prevent invalid values.
+ */
+ if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8))
+ return -EINVAL;
+
+ fpu_force_restore(fpu);
+
+ cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER);
+ if (WARN_ON(!cetregs)) {
+ /*
+ * This shouldn't ever be NULL because shadow stack was
+ * verified to be enabled above. This means
+ * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so
+ * XFEATURE_CET_USER should not be in the init state.
+ */
+ return -ENODEV;
+ }
+
+ cetregs->user_ssp = user_ssp;
+ return 0;
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 1afbc4866b10..cadf68737e6b 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -39,26 +39,26 @@
*/
static const char *xfeature_names[] =
{
- "x87 floating point registers" ,
- "SSE registers" ,
- "AVX registers" ,
- "MPX bounds registers" ,
- "MPX CSR" ,
- "AVX-512 opmask" ,
- "AVX-512 Hi256" ,
- "AVX-512 ZMM_Hi256" ,
- "Processor Trace (unused)" ,
+ "x87 floating point registers",
+ "SSE registers",
+ "AVX registers",
+ "MPX bounds registers",
+ "MPX CSR",
+ "AVX-512 opmask",
+ "AVX-512 Hi256",
+ "AVX-512 ZMM_Hi256",
+ "Processor Trace (unused)",
"Protection Keys User registers",
"PASID state",
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "AMX Tile config" ,
- "AMX Tile data" ,
- "unknown xstate feature" ,
+ "Control-flow User registers",
+ "Control-flow Kernel registers (unused)",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "AMX Tile config",
+ "AMX Tile data",
+ "unknown xstate feature",
};
static unsigned short xsave_cpuid_features[] __initdata = {
@@ -71,8 +71,9 @@ static unsigned short xsave_cpuid_features[] __initdata = {
[XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F,
[XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F,
[XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
- [XFEATURE_PKRU] = X86_FEATURE_PKU,
+ [XFEATURE_PKRU] = X86_FEATURE_OSPKE,
[XFEATURE_PASID] = X86_FEATURE_ENQCMD,
+ [XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
[XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
[XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
};
@@ -276,6 +277,7 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
print_xstate_feature(XFEATURE_MASK_PKRU);
print_xstate_feature(XFEATURE_MASK_PASID);
+ print_xstate_feature(XFEATURE_MASK_CET_USER);
print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
}
@@ -344,6 +346,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate)
XFEATURE_MASK_BNDREGS | \
XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_CET_USER | \
XFEATURE_MASK_XTILE)
/*
@@ -446,14 +449,15 @@ static void __init __xstate_dump_leaves(void)
} \
} while (0)
-#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \
- if ((nr == nr_macro) && \
- WARN_ONCE(sz != sizeof(__struct), \
- "%s: struct is %zu bytes, cpu state %d bytes\n", \
- __stringify(nr_macro), sizeof(__struct), sz)) { \
+#define XCHECK_SZ(sz, nr, __struct) ({ \
+ if (WARN_ONCE(sz != sizeof(__struct), \
+ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \
+ xfeature_names[nr], sizeof(__struct), sz)) { \
__xstate_dump_leaves(); \
} \
-} while (0)
+ true; \
+})
+
/**
* check_xtile_data_against_struct - Check tile data state size.
@@ -527,36 +531,28 @@ static bool __init check_xstate_against_struct(int nr)
* Ask the CPU for the size of the state.
*/
int sz = xfeature_size(nr);
+
/*
* Match each CPU state with the corresponding software
* structure.
*/
- XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct);
- XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state);
- XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state);
- XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
- XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
- XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
- XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
- XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state);
- XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
-
- /* The tile data size varies between implementations. */
- if (nr == XFEATURE_XTILE_DATA)
- check_xtile_data_against_struct(sz);
-
- /*
- * Make *SURE* to add any feature numbers in below if
- * there are "holes" in the xsave state component
- * numbers.
- */
- if ((nr < XFEATURE_YMM) ||
- (nr >= XFEATURE_MAX) ||
- (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
- ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
+ switch (nr) {
+ case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct);
+ case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
+ case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
+ case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
+ case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
+ case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
+ case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state);
+ case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
+ case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
+ case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
+ case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
+ default:
XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
return false;
}
+
return true;
}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 4d8aff05a509..30a55207c000 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -231,9 +231,7 @@ struct irq_chip i8259A_chip = {
};
static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
+/* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */
static void restore_ELCR(char *trigger)
{
outb(trigger[0], PIC_ELCR1);
diff --git a/arch/x86/kernel/ibt_selftest.S b/arch/x86/kernel/ibt_selftest.S
new file mode 100644
index 000000000000..c43c4ed28a9c
--- /dev/null
+++ b/arch/x86/kernel/ibt_selftest.S
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <linux/objtool.h>
+#include <asm/nospec-branch.h>
+
+SYM_CODE_START(ibt_selftest_noendbr)
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_FUNC
+ /* #CP handler sets %ax to 0 */
+ RET
+SYM_CODE_END(ibt_selftest_noendbr)
+
+SYM_FUNC_START(ibt_selftest)
+ lea ibt_selftest_noendbr(%rip), %rax
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rax
+SYM_FUNC_END(ibt_selftest)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index a58c6bc1cd68..b786d48f5a0f 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -107,7 +107,7 @@ static const __initconst struct idt_data def_idts[] = {
ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
#endif
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
INTG(X86_TRAP_CP, asm_exc_control_protection),
#endif
@@ -131,7 +131,6 @@ static const __initconst struct idt_data apic_idts[] = {
INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi),
INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function),
INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single),
- INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup),
INTG(REBOOT_VECTOR, asm_sysvec_reboot),
#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9f668d2f3d11..11761c124545 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -49,7 +49,7 @@ void ack_bad_irq(unsigned int irq)
* completely.
* But only ack when the APIC is enabled -AK
*/
- ack_APIC_irq();
+ apic_eoi();
}
#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -256,7 +256,7 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
if (likely(!IS_ERR_OR_NULL(desc))) {
handle_irq(desc, regs);
} else {
- ack_APIC_irq();
+ apic_eoi();
if (desc == VECTOR_UNUSED) {
pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
@@ -280,7 +280,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
{
struct pt_regs *old_regs = set_irq_regs(regs);
- ack_APIC_irq();
+ apic_eoi();
trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
inc_irq_stat(x86_platform_ipis);
if (x86_platform_ipi_callback)
@@ -310,7 +310,7 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
*/
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi)
{
- ack_APIC_irq();
+ apic_eoi();
inc_irq_stat(kvm_posted_intr_ipis);
}
@@ -319,7 +319,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi)
*/
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi)
{
- ack_APIC_irq();
+ apic_eoi();
inc_irq_stat(kvm_posted_intr_wakeup_ipis);
kvm_posted_intr_wakeup_handler();
}
@@ -329,7 +329,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi)
*/
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
{
- ack_APIC_irq();
+ apic_eoi();
inc_irq_stat(kvm_posted_intr_nested_ipis);
}
#endif
@@ -401,6 +401,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- ack_APIC_irq();
+ apic_eoi();
}
#endif
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 890d4778cd35..b0a24deab4a1 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -16,7 +16,7 @@
#ifdef CONFIG_X86_LOCAL_APIC
DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work)
{
- ack_APIC_irq();
+ apic_eoi();
trace_irq_work_entry(IRQ_WORK_VECTOR);
inc_irq_stat(apic_irq_work_irqs);
irq_work_run();
@@ -28,7 +28,7 @@ void arch_irq_work_raise(void)
if (!arch_irq_work_has_interrupt())
return;
- apic->send_IPI_self(IRQ_WORK_VECTOR);
+ __apic_send_IPI_self(IRQ_WORK_VECTOR);
apic_wait_icr_idle();
}
#endif
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index 4eb8f2d19a87..578d16fc040f 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -101,10 +101,8 @@ static void __init jailhouse_get_smp_config(unsigned int early)
register_lapic_address(0xfee00000);
- for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) {
- generic_processor_info(setup_data.v1.cpu_ids[cpu],
- boot_cpu_apic_version);
- }
+ for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++)
+ generic_processor_info(setup_data.v1.cpu_ids[cpu]);
smp_found_config = 1;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f7f6042eb7e6..e8babebad7b8 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -45,6 +45,7 @@
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
#include <linux/set_memory.h>
+#include <linux/cfi.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -293,7 +294,40 @@ static int can_probe(unsigned long paddr)
#endif
addr += insn.length;
}
+ if (IS_ENABLED(CONFIG_CFI_CLANG)) {
+ /*
+ * The compiler generates the following instruction sequence
+ * for indirect call checks and cfi.c decodes this;
+ *
+ *  movl -<id>, %r10d ; 6 bytes
+ * addl -4(%reg), %r10d ; 4 bytes
+ * je .Ltmp1 ; 2 bytes
+ * ud2 ; <- regs->ip
+ * .Ltmp1:
+ *
+ * Also, these movl and addl are used for showing expected
+ * type. So those must not be touched.
+ */
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return 0;
+
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return 0;
+
+ if (insn.opcode.value == 0xBA)
+ offset = 12;
+ else if (insn.opcode.value == 0x3)
+ offset = 6;
+ else
+ goto out;
+
+ /* This movl/addl is used for decoding CFI. */
+ if (is_cfi_trap(addr + offset))
+ return 0;
+ }
+out:
return (addr == paddr);
}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 526d4da3dcd4..b8ab9ee5896c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -291,7 +291,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
struct pt_regs *old_regs = set_irq_regs(regs);
u32 token;
- ack_APIC_irq();
+ apic_eoi();
inc_irq_stat(irq_hv_callback_count);
@@ -332,7 +332,7 @@ static void kvm_register_steal_time(void)
static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
-static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
{
/**
* This relies on __test_and_clear_bit to modify the memory
@@ -343,7 +343,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
*/
if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
return;
- apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
+ apic_native_eoi();
}
static void kvm_guest_cpu_init(void)
@@ -622,10 +622,10 @@ late_initcall(setup_efi_kvm_sev_migration);
/*
* Set the IPI entry points
*/
-static void kvm_setup_pv_ipi(void)
+static __init void kvm_setup_pv_ipi(void)
{
- apic->send_IPI_mask = kvm_send_ipi_mask;
- apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
+ apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
+ apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
pr_info("setup PV IPIs\n");
}
@@ -825,7 +825,7 @@ static void __init kvm_guest_init(void)
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
- apic_set_eoi_write(kvm_guest_apic_eoi_write);
+ apic_update_callback(eoi, kvm_guest_apic_eoi_write);
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
static_branch_enable(&kvm_async_pf_enabled);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index fed721f90116..b223922248e9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -48,7 +48,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static void __init MP_processor_info(struct mpc_cpu *m)
{
- int apicid;
char *bootup_cpu = "";
if (!(m->cpuflag & CPU_ENABLED)) {
@@ -56,15 +55,11 @@ static void __init MP_processor_info(struct mpc_cpu *m)
return;
}
- apicid = m->apicid;
-
- if (m->cpuflag & CPU_BOOTPROCESSOR) {
+ if (m->cpuflag & CPU_BOOTPROCESSOR)
bootup_cpu = " (Bootup-CPU)";
- boot_cpu_physical_apicid = m->apicid;
- }
pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
- generic_processor_info(apicid, m->apicver);
+ generic_processor_info(m->apicid);
}
#ifdef CONFIG_X86_IO_APIC
@@ -380,11 +375,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
int i;
/*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /*
* 2 CPUs, numbered 0 & 1.
*/
processor.type = MP_PROCESSOR;
@@ -525,10 +515,8 @@ void __init default_get_smp_config(unsigned int early)
*/
if (mpf->feature1) {
if (early) {
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ /* Local APIC has default address */
+ register_lapic_address(APIC_DEFAULT_PHYS_BASE);
goto out;
}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bb17d37db01..e17c16c54a37 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -39,7 +39,6 @@
#include <asm/cpufeature.h>
#include <asm/msr.h>
-static struct class *msr_class;
static enum cpuhp_state cpuhp_msr_state;
enum allow_write_msrs {
@@ -235,26 +234,31 @@ static const struct file_operations msr_fops = {
.compat_ioctl = msr_ioctl,
};
+static char *msr_devnode(const struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
+}
+
+static const struct class msr_class = {
+ .name = "msr",
+ .devnode = msr_devnode,
+};
+
static int msr_device_create(unsigned int cpu)
{
struct device *dev;
- dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
+ dev = device_create(&msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
"msr%d", cpu);
return PTR_ERR_OR_ZERO(dev);
}
static int msr_device_destroy(unsigned int cpu)
{
- device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+ device_destroy(&msr_class, MKDEV(MSR_MAJOR, cpu));
return 0;
}
-static char *msr_devnode(const struct device *dev, umode_t *mode)
-{
- return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
-}
-
static int __init msr_init(void)
{
int err;
@@ -263,12 +267,9 @@ static int __init msr_init(void)
pr_err("unable to get major %d for msr\n", MSR_MAJOR);
return -EBUSY;
}
- msr_class = class_create("msr");
- if (IS_ERR(msr_class)) {
- err = PTR_ERR(msr_class);
+ err = class_register(&msr_class);
+ if (err)
goto out_chrdev;
- }
- msr_class->devnode = msr_devnode;
err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
msr_device_create, msr_device_destroy);
@@ -278,7 +279,7 @@ static int __init msr_init(void)
return 0;
out_class:
- class_destroy(msr_class);
+ class_unregister(&msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
return err;
@@ -288,7 +289,7 @@ module_init(msr_init);
static void __exit msr_exit(void)
{
cpuhp_remove_state(cpuhp_msr_state);
- class_destroy(msr_class);
+ class_unregister(&msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
}
module_exit(msr_exit)
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index a1a96df3dff1..e93a8545c74d 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -75,7 +75,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
/* sync above data before sending NMI */
wmb();
- apic->send_IPI_mask(mask, NMI_VECTOR);
+ __apic_send_IPI_mask(mask, NMI_VECTOR);
/* Don't wait longer than a second */
timeout = USEC_PER_SEC;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index de6be0a3965e..f323d83e40a7 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -72,9 +72,15 @@ static inline void __init pci_swiotlb_detect(void)
#endif /* CONFIG_SWIOTLB */
#ifdef CONFIG_SWIOTLB_XEN
+static bool xen_swiotlb_enabled(void)
+{
+ return xen_initial_domain() || x86_swiotlb_enable ||
+ (IS_ENABLED(CONFIG_XEN_PCIDEV_FRONTEND) && xen_pv_pci_possible);
+}
+
static void __init pci_xen_swiotlb_init(void)
{
- if (!xen_initial_domain() && !x86_swiotlb_enable)
+ if (!xen_swiotlb_enabled())
return;
x86_swiotlb_enable = true;
x86_swiotlb_flags |= SWIOTLB_ANY;
@@ -83,27 +89,6 @@ static void __init pci_xen_swiotlb_init(void)
if (IS_ENABLED(CONFIG_PCI))
pci_request_acs();
}
-
-int pci_xen_swiotlb_init_late(void)
-{
- if (dma_ops == &xen_swiotlb_dma_ops)
- return 0;
-
- /* we can work with the default swiotlb */
- if (!io_tlb_default_mem.nslabs) {
- int rc = swiotlb_init_late(swiotlb_size_or_default(),
- GFP_KERNEL, xen_swiotlb_fixup);
- if (rc < 0)
- return rc;
- }
-
- /* XXX: this switches the dma ops under live devices! */
- dma_ops = &xen_swiotlb_dma_ops;
- if (IS_ENABLED(CONFIG_PCI))
- pci_request_acs();
- return 0;
-}
-EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
#else
static inline void __init pci_xen_swiotlb_init(void)
{
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 72015dba72ab..9f0909142a0a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -51,6 +51,7 @@
#include <asm/unwind.h>
#include <asm/tdx.h>
#include <asm/mmu_context.h>
+#include <asm/shstk.h>
#include "process.h"
@@ -122,6 +123,7 @@ void exit_thread(struct task_struct *tsk)
free_vm86(t);
+ shstk_free(tsk);
fpu__drop(fpu);
}
@@ -162,6 +164,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
struct inactive_task_frame *frame;
struct fork_frame *fork_frame;
struct pt_regs *childregs;
+ unsigned long new_ssp;
int ret = 0;
childregs = task_pt_regs(p);
@@ -199,7 +202,16 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame->flags = X86_EFLAGS_FIXED;
#endif
- fpu_clone(p, clone_flags, args->fn);
+ /*
+ * Allocate a new shadow stack for thread if needed. If shadow stack,
+ * is disabled, new_ssp will remain 0, and fpu_clone() will know not to
+ * update it.
+ */
+ new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
+ if (IS_ERR_VALUE(new_ssp))
+ return PTR_ERR((void *)new_ssp);
+
+ fpu_clone(p, clone_flags, args->fn, new_ssp);
/* Kernel thread ? */
if (unlikely(p->flags & PF_KTHREAD)) {
@@ -245,6 +257,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);
+ /*
+ * If copy_thread() if failing, don't leak the shadow stack possibly
+ * allocated in shstk_alloc_thread_stack() above.
+ */
+ if (ret)
+ shstk_free(p);
+
return ret;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d181c16a2f6..33b268747bb7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -515,6 +515,8 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
load_gs_index(__USER_DS);
}
+ reset_thread_features();
+
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
@@ -894,6 +896,12 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
else
return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
#endif
+ case ARCH_SHSTK_ENABLE:
+ case ARCH_SHSTK_DISABLE:
+ case ARCH_SHSTK_LOCK:
+ case ARCH_SHSTK_UNLOCK:
+ case ARCH_SHSTK_STATUS:
+ return shstk_prctl(task, option, arg2);
default:
ret = -EINVAL;
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index dfaa270a7cc9..095f04bdabdc 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -58,6 +58,7 @@ enum x86_regset_64 {
REGSET64_FP,
REGSET64_IOPERM,
REGSET64_XSTATE,
+ REGSET64_SSP,
};
#define REGSET_GENERAL \
@@ -1267,6 +1268,17 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.active = ioperm_active,
.regset_get = ioperm_get
},
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ [REGSET64_SSP] = {
+ .core_note_type = NT_X86_SHSTK,
+ .n = 1,
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = ssp_active,
+ .regset_get = ssp_get,
+ .set = ssp_set
+ },
+#endif
};
static const struct user_regset_view user_x86_64_view = {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fd975a4a5200..b9145a63da77 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,7 +114,6 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
/* CPU data as detected by the assembly code in head_32.S */
struct cpuinfo_x86 new_cpu_data;
-unsigned int def_to_bigsmp;
struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);
@@ -1018,9 +1017,11 @@ void __init setup_arch(char **cmdline_p)
x86_report_nx();
+ apic_setup_apic_calls();
+
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
- disable_apic = 1;
+ apic_is_disabled = true;
#endif
setup_clear_cpu_cap(X86_FEATURE_APIC);
}
@@ -1253,7 +1254,7 @@ void __init setup_arch(char **cmdline_p)
map_vsyscall();
- generic_apic_probe();
+ x86_32_probe_apic();
early_quirks();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index c242dc47e9cb..2c97bf7b56ae 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -181,15 +181,9 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_LOCAL_APIC
per_cpu(x86_cpu_to_apicid, cpu) =
early_per_cpu_map(x86_cpu_to_apicid, cpu);
- per_cpu(x86_bios_cpu_apicid, cpu) =
- early_per_cpu_map(x86_bios_cpu_apicid, cpu);
per_cpu(x86_cpu_to_acpiid, cpu) =
early_per_cpu_map(x86_cpu_to_acpiid, cpu);
#endif
-#ifdef CONFIG_X86_32
- per_cpu(x86_cpu_to_logical_apicid, cpu) =
- early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
-#endif
#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -214,12 +208,8 @@ void __init setup_per_cpu_areas(void)
/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
- early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
#endif
-#ifdef CONFIG_X86_32
- early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
-#endif
#ifdef CONFIG_NUMA
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index d380c9399480..2787826d9f60 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -1089,7 +1089,7 @@ static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip)
return ret;
}
-void snp_set_wakeup_secondary_cpu(void)
+void __init snp_set_wakeup_secondary_cpu(void)
{
if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return;
@@ -1099,7 +1099,7 @@ void snp_set_wakeup_secondary_cpu(void)
* required method to start APs under SNP. If the hypervisor does
* not support AP creation, then no APs will be started.
*/
- apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit;
+ apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit);
}
int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
new file mode 100644
index 000000000000..fd689921a1db
--- /dev/null
+++ b/arch/x86/kernel/shstk.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * shstk.c - Intel shadow stack support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <linux/syscalls.h>
+#include <asm/msr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/shstk.h>
+#include <asm/special_insns.h>
+#include <asm/fpu/api.h>
+#include <asm/prctl.h>
+
+#define SS_FRAME_SIZE 8
+
+static bool features_enabled(unsigned long features)
+{
+ return current->thread.features & features;
+}
+
+static void features_set(unsigned long features)
+{
+ current->thread.features |= features;
+}
+
+static void features_clr(unsigned long features)
+{
+ current->thread.features &= ~features;
+}
+
+/*
+ * Create a restore token on the shadow stack. A token is always 8-byte
+ * and aligned to 8.
+ */
+static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
+{
+ unsigned long addr;
+
+ /* Token must be aligned */
+ if (!IS_ALIGNED(ssp, 8))
+ return -EINVAL;
+
+ addr = ssp - SS_FRAME_SIZE;
+
+ /*
+ * SSP is aligned, so reserved bits and mode bit are a zero, just mark
+ * the token 64-bit.
+ */
+ ssp |= BIT(0);
+
+ if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
+ return -EFAULT;
+
+ if (token_addr)
+ *token_addr = addr;
+
+ return 0;
+}
+
+/*
+ * VM_SHADOW_STACK will have a guard page. This helps userspace protect
+ * itself from attacks. The reasoning is as follows:
+ *
+ * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
+ * INCSSP instruction can increment the shadow stack pointer. It is the
+ * shadow stack analog of an instruction like:
+ *
+ * addq $0x80, %rsp
+ *
+ * However, there is one important difference between an ADD on %rsp
+ * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
+ * memory of the first and last elements that were "popped". It can be
+ * thought of as acting like this:
+ *
+ * READ_ONCE(ssp); // read+discard top element on stack
+ * ssp += nr_to_pop * 8; // move the shadow stack
+ * READ_ONCE(ssp-8); // read+discard last popped stack element
+ *
+ * The maximum distance INCSSP can move the SSP is 2040 bytes, before
+ * it would read the memory. Therefore a single page gap will be enough
+ * to prevent any operation from shifting the SSP to an adjacent stack,
+ * since it would have to land in the gap at least once, causing a
+ * fault.
+ */
+static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
+ unsigned long token_offset, bool set_res_tok)
+{
+ int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
+ struct mm_struct *mm = current->mm;
+ unsigned long mapped_addr, unused;
+
+ if (addr)
+ flags |= MAP_FIXED_NOREPLACE;
+
+ mmap_write_lock(mm);
+ mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+ VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+ mmap_write_unlock(mm);
+
+ if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
+ goto out;
+
+ if (create_rstor_token(mapped_addr + token_offset, NULL)) {
+ vm_munmap(mapped_addr, size);
+ return -EINVAL;
+ }
+
+out:
+ return mapped_addr;
+}
+
+static unsigned long adjust_shstk_size(unsigned long size)
+{
+ if (size)
+ return PAGE_ALIGN(size);
+
+ return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
+}
+
+static void unmap_shadow_stack(u64 base, u64 size)
+{
+ int r;
+
+ r = vm_munmap(base, size);
+
+ /*
+ * mmap_write_lock_killable() failed with -EINTR. This means
+ * the process is about to die and have it's MM cleaned up.
+ * This task shouldn't ever make it back to userspace. In this
+ * case it is ok to leak a shadow stack, so just exit out.
+ */
+ if (r == -EINTR)
+ return;
+
+ /*
+ * For all other types of vm_munmap() failure, either the
+ * system is out of memory or there is bug.
+ */
+ WARN_ON_ONCE(r);
+}
+
+static int shstk_setup(void)
+{
+ struct thread_shstk *shstk = &current->thread.shstk;
+ unsigned long addr, size;
+
+ /* Already enabled */
+ if (features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /* Also not supported for 32 bit and x32 */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall())
+ return -EOPNOTSUPP;
+
+ size = adjust_shstk_size(0);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return PTR_ERR((void *)addr);
+
+ fpregs_lock_and_load();
+ wrmsrl(MSR_IA32_PL3_SSP, addr + size);
+ wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+ fpregs_unlock();
+
+ shstk->base = addr;
+ shstk->size = size;
+ features_set(ARCH_SHSTK_SHSTK);
+
+ return 0;
+}
+
+void reset_thread_features(void)
+{
+ memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
+ current->thread.features = 0;
+ current->thread.features_locked = 0;
+}
+
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
+ unsigned long stack_size)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+ unsigned long addr, size;
+
+ /*
+ * If shadow stack is not enabled on the new thread, skip any
+ * switch to a new shadow stack.
+ */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /*
+ * For CLONE_VM, except vfork, the child needs a separate shadow
+ * stack.
+ */
+ if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+ return 0;
+
+ size = adjust_shstk_size(stack_size);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ shstk->base = addr;
+ shstk->size = size;
+
+ return addr + size;
+}
+
+static unsigned long get_user_shstk_addr(void)
+{
+ unsigned long long ssp;
+
+ fpregs_lock_and_load();
+
+ rdmsrl(MSR_IA32_PL3_SSP, ssp);
+
+ fpregs_unlock();
+
+ return ssp;
+}
+
+#define SHSTK_DATA_BIT BIT(63)
+
+static int put_shstk_data(u64 __user *addr, u64 data)
+{
+ if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
+ return -EINVAL;
+
+ /*
+ * Mark the high bit so that the sigframe can't be processed as a
+ * return address.
+ */
+ if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
+ return -EFAULT;
+ return 0;
+}
+
+static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
+{
+ unsigned long ldata;
+
+ if (unlikely(get_user(ldata, addr)))
+ return -EFAULT;
+
+ if (!(ldata & SHSTK_DATA_BIT))
+ return -EINVAL;
+
+ *data = ldata & ~SHSTK_DATA_BIT;
+
+ return 0;
+}
+
+static int shstk_push_sigframe(unsigned long *ssp)
+{
+ unsigned long target_ssp = *ssp;
+
+ /* Token must be aligned */
+ if (!IS_ALIGNED(target_ssp, 8))
+ return -EINVAL;
+
+ *ssp -= SS_FRAME_SIZE;
+ if (put_shstk_data((void __user *)*ssp, target_ssp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int shstk_pop_sigframe(unsigned long *ssp)
+{
+ struct vm_area_struct *vma;
+ unsigned long token_addr;
+ bool need_to_check_vma;
+ int err = 1;
+
+ /*
+ * It is possible for the SSP to be off the end of a shadow stack by 4
+ * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
+ * before it, it might be this case, so check that the address being
+ * read is actually shadow stack.
+ */
+ if (!IS_ALIGNED(*ssp, 8))
+ return -EINVAL;
+
+ need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
+
+ if (need_to_check_vma)
+ mmap_read_lock_killable(current->mm);
+
+ err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
+ if (unlikely(err))
+ goto out_err;
+
+ if (need_to_check_vma) {
+ vma = find_vma(current->mm, *ssp);
+ if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
+ err = -EFAULT;
+ goto out_err;
+ }
+
+ mmap_read_unlock(current->mm);
+ }
+
+ /* Restore SSP aligned? */
+ if (unlikely(!IS_ALIGNED(token_addr, 8)))
+ return -EINVAL;
+
+ /* SSP in userspace? */
+ if (unlikely(token_addr >= TASK_SIZE_MAX))
+ return -EINVAL;
+
+ *ssp = token_addr;
+
+ return 0;
+out_err:
+ if (need_to_check_vma)
+ mmap_read_unlock(current->mm);
+ return err;
+}
+
+int setup_signal_shadow_stack(struct ksignal *ksig)
+{
+ void __user *restorer = ksig->ka.sa.sa_restorer;
+ unsigned long ssp;
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ if (!restorer)
+ return -EINVAL;
+
+ ssp = get_user_shstk_addr();
+ if (unlikely(!ssp))
+ return -EINVAL;
+
+ err = shstk_push_sigframe(&ssp);
+ if (unlikely(err))
+ return err;
+
+ /* Push restorer address */
+ ssp -= SS_FRAME_SIZE;
+ err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
+ if (unlikely(err))
+ return -EFAULT;
+
+ fpregs_lock_and_load();
+ wrmsrl(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return 0;
+}
+
+int restore_signal_shadow_stack(void)
+{
+ unsigned long ssp;
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ ssp = get_user_shstk_addr();
+ if (unlikely(!ssp))
+ return -EINVAL;
+
+ err = shstk_pop_sigframe(&ssp);
+ if (unlikely(err))
+ return err;
+
+ fpregs_lock_and_load();
+ wrmsrl(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return 0;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return;
+
+ /*
+ * When fork() with CLONE_VM fails, the child (tsk) already has a
+ * shadow stack allocated, and exit_thread() calls this function to
+ * free it. In this case the parent (current) and the child share
+ * the same mm struct.
+ */
+ if (!tsk->mm || tsk->mm != current->mm)
+ return;
+
+ unmap_shadow_stack(shstk->base, shstk->size);
+}
+
+static int wrss_control(bool enable)
+{
+ u64 msrval;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /*
+ * Only enable WRSS if shadow stack is enabled. If shadow stack is not
+ * enabled, WRSS will already be disabled, so don't bother clearing it
+ * when disabling.
+ */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -EPERM;
+
+ /* Already enabled/disabled? */
+ if (features_enabled(ARCH_SHSTK_WRSS) == enable)
+ return 0;
+
+ fpregs_lock_and_load();
+ rdmsrl(MSR_IA32_U_CET, msrval);
+
+ if (enable) {
+ features_set(ARCH_SHSTK_WRSS);
+ msrval |= CET_WRSS_EN;
+ } else {
+ features_clr(ARCH_SHSTK_WRSS);
+ if (!(msrval & CET_WRSS_EN))
+ goto unlock;
+
+ msrval &= ~CET_WRSS_EN;
+ }
+
+ wrmsrl(MSR_IA32_U_CET, msrval);
+
+unlock:
+ fpregs_unlock();
+
+ return 0;
+}
+
+static int shstk_disable(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /* Already disabled? */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ fpregs_lock_and_load();
+ /* Disable WRSS too when disabling shadow stack */
+ wrmsrl(MSR_IA32_U_CET, 0);
+ wrmsrl(MSR_IA32_PL3_SSP, 0);
+ fpregs_unlock();
+
+ shstk_free(current);
+ features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
+
+ return 0;
+}
+
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
+{
+ bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
+ unsigned long aligned_size;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ if (flags & ~SHADOW_STACK_SET_TOKEN)
+ return -EINVAL;
+
+ /* If there isn't space for a token */
+ if (set_tok && size < 8)
+ return -ENOSPC;
+
+ if (addr && addr < SZ_4G)
+ return -ERANGE;
+
+ /*
+ * An overflow would result in attempting to write the restore token
+ * to the wrong location. Not catastrophic, but just return the right
+ * error code and block it.
+ */
+ aligned_size = PAGE_ALIGN(size);
+ if (aligned_size < size)
+ return -EOVERFLOW;
+
+ return alloc_shstk(addr, aligned_size, size, set_tok);
+}
+
+long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
+{
+ unsigned long features = arg2;
+
+ if (option == ARCH_SHSTK_STATUS) {
+ return put_user(task->thread.features, (unsigned long __user *)arg2);
+ }
+
+ if (option == ARCH_SHSTK_LOCK) {
+ task->thread.features_locked |= features;
+ return 0;
+ }
+
+ /* Only allow via ptrace */
+ if (task != current) {
+ if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
+ task->thread.features_locked &= ~features;
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ /* Do not allow to change locked features */
+ if (features & task->thread.features_locked)
+ return -EPERM;
+
+ /* Only support enabling/disabling one feature at a time. */
+ if (hweight_long(features) > 1)
+ return -EINVAL;
+
+ if (option == ARCH_SHSTK_DISABLE) {
+ if (features & ARCH_SHSTK_WRSS)
+ return wrss_control(false);
+ if (features & ARCH_SHSTK_SHSTK)
+ return shstk_disable();
+ return -EINVAL;
+ }
+
+ /* Handle ARCH_SHSTK_ENABLE */
+ if (features & ARCH_SHSTK_SHSTK)
+ return shstk_setup();
+ if (features & ARCH_SHSTK_WRSS)
+ return wrss_control(true);
+ return -EINVAL;
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cfeec3ee877e..65fe2094da59 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -40,6 +40,7 @@
#include <asm/syscall.h>
#include <asm/sigframe.h>
#include <asm/signal.h>
+#include <asm/shstk.h>
static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 9027fc088f97..c12624bc82a3 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -402,7 +402,7 @@ Efault:
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 13a1e6083837..cacf2ede6217 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -175,6 +175,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
uc_flags = frame_uc_flags(regs);
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -260,6 +263,9 @@ SYSCALL_DEFINE0(rt_sigreturn)
if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
@@ -403,7 +409,7 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 7eb18ca7bd45..6eb06d001bcc 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -135,7 +135,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
*/
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
- ack_APIC_irq();
+ apic_eoi();
cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
}
@@ -237,7 +237,7 @@ static void native_stop_other_cpus(int wait)
pr_emerg("Shutting down cpus with NMI\n");
for_each_cpu(cpu, &cpus_stop_mask)
- apic->send_IPI(cpu, NMI_VECTOR);
+ __apic_send_IPI(cpu, NMI_VECTOR);
}
/*
* Don't wait longer than 10 ms if the caller didn't
@@ -268,7 +268,7 @@ done:
*/
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
{
- ack_APIC_irq();
+ apic_eoi();
trace_reschedule_entry(RESCHEDULE_VECTOR);
inc_irq_stat(irq_resched_count);
scheduler_ipi();
@@ -277,7 +277,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
{
- ack_APIC_irq();
+ apic_eoi();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
inc_irq_stat(irq_call_count);
generic_smp_call_function_interrupt();
@@ -286,7 +286,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
{
- ack_APIC_irq();
+ apic_eoi();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
inc_irq_stat(irq_call_count);
generic_smp_call_function_single_interrupt();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d40ed3a7dc23..d7667a29acf3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -414,7 +414,7 @@ found:
return 0;
}
-void __init smp_store_boot_cpu_info(void)
+static void __init smp_store_boot_cpu_info(void)
{
int id = 0; /* CPU 0 */
struct cpuinfo_x86 *c = &cpu_data(id);
@@ -761,44 +761,6 @@ static void impress_friends(void)
pr_debug("Before bogocount - setting activated=1\n");
}
-void __inquire_remote_apic(int apicid)
-{
- unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
- const char * const names[] = { "ID", "VERSION", "SPIV" };
- int timeout;
- u32 status;
-
- pr_info("Inquiring remote APIC 0x%x...\n", apicid);
-
- for (i = 0; i < ARRAY_SIZE(regs); i++) {
- pr_info("... APIC 0x%x %s: ", apicid, names[i]);
-
- /*
- * Wait for idle.
- */
- status = safe_apic_wait_icr_idle();
- if (status)
- pr_cont("a previous APIC delivery may have failed\n");
-
- apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
-
- timeout = 0;
- do {
- udelay(100);
- status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
- } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
-
- switch (status) {
- case APIC_ICR_RR_VALID:
- status = apic_read(APIC_RRR);
- pr_cont("%08x\n", status);
- break;
- default:
- pr_cont("failed\n");
- }
- }
-}
-
/*
* The Multiprocessor Specification 1.4 (1997) example code suggests
* that there should be a 10ms delay between the BSP asserting INIT
@@ -1089,9 +1051,8 @@ int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
- if (apicid == BAD_APICID ||
- !physid_isset(apicid, phys_cpu_present_map) ||
- !apic->apic_id_valid(apicid)) {
+ if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) ||
+ !apic_id_valid(apicid)) {
pr_err("%s: bad cpu %d\n", __func__, cpu);
return -EINVAL;
}
@@ -1174,58 +1135,6 @@ static __init void disable_smp(void)
cpumask_set_cpu(0, topology_die_cpumask(0));
}
-/*
- * Various sanity checks.
- */
-static void __init smp_sanity_check(void)
-{
- preempt_disable();
-
-#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
- if (def_to_bigsmp && nr_cpu_ids > 8) {
- unsigned int cpu;
- unsigned nr;
-
- pr_warn("More than 8 CPUs detected - skipping them\n"
- "Use CONFIG_X86_BIGSMP\n");
-
- nr = 0;
- for_each_present_cpu(cpu) {
- if (nr >= 8)
- set_cpu_present(cpu, false);
- nr++;
- }
-
- nr = 0;
- for_each_possible_cpu(cpu) {
- if (nr >= 8)
- set_cpu_possible(cpu, false);
- nr++;
- }
-
- set_nr_cpu_ids(8);
- }
-#endif
-
- if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
- hard_smp_processor_id());
-
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
-
- /*
- * Should not be necessary because the MP table should list the boot
- * CPU too, but we do it for the sake of robustness anyway.
- */
- if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
- pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
- boot_cpu_physical_apicid);
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
- preempt_enable();
-}
-
static void __init smp_cpu_index_default(void)
{
int i;
@@ -1285,8 +1194,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
smp_prepare_cpus_common();
- smp_sanity_check();
-
switch (apic_intr_mode) {
case APIC_PIC:
case APIC_VIRTUAL_WIRE_NO_CONFIG:
@@ -1422,24 +1329,6 @@ __init void prefill_possible_map(void)
{
int i, possible;
- /* No boot processor was found in mptable or ACPI MADT */
- if (!num_processors) {
- if (boot_cpu_has(X86_FEATURE_APIC)) {
- int apicid = boot_cpu_physical_apicid;
- int cpu = hard_smp_processor_id();
-
- pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
-
- /* Make sure boot cpu is enumerated */
- if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
- apic->apic_id_valid(apicid))
- generic_processor_info(apicid, boot_cpu_apic_version);
- }
-
- if (!num_processors)
- num_processors = 1;
- }
-
i = setup_max_cpus ?: 1;
if (setup_possible_cpus == -1) {
possible = num_processors;
@@ -1601,9 +1490,7 @@ void play_dead_common(void)
idle_task_exit();
cpuhp_ap_report_dead();
- /*
- * With physical CPU hotplug, we should halt the cpu
- */
+
local_irq_disable();
}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 8cc653ffdccd..c783aeb37dce 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -193,7 +193,11 @@ get_unmapped_area:
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = PAGE_SIZE;
+ if (!in_32bit_syscall() && (flags & MAP_ABOVE4G))
+ info.low_limit = SZ_4G;
+ else
+ info.low_limit = PAGE_SIZE;
+
info.high_limit = get_mmap_base(0);
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4a817d20ce3b..c876f1d36a81 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -77,18 +77,6 @@
DECLARE_BITMAP(system_vectors, NR_VECTORS);
-static inline void cond_local_irq_enable(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
-}
-
-static inline void cond_local_irq_disable(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_disable();
-}
-
__always_inline int is_valid_bugaddr(unsigned long addr)
{
if (addr < TASK_SIZE_MAX)
@@ -213,81 +201,6 @@ DEFINE_IDTENTRY(exc_overflow)
do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}
-#ifdef CONFIG_X86_KERNEL_IBT
-
-static __ro_after_init bool ibt_fatal = true;
-
-extern void ibt_selftest_ip(void); /* code label defined in asm below */
-
-enum cp_error_code {
- CP_EC = (1 << 15) - 1,
-
- CP_RET = 1,
- CP_IRET = 2,
- CP_ENDBR = 3,
- CP_RSTRORSSP = 4,
- CP_SETSSBSY = 5,
-
- CP_ENCL = 1 << 15,
-};
-
-DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
-{
- if (!cpu_feature_enabled(X86_FEATURE_IBT)) {
- pr_err("Unexpected #CP\n");
- BUG();
- }
-
- if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR))
- return;
-
- if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) {
- regs->ax = 0;
- return;
- }
-
- pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
- if (!ibt_fatal) {
- printk(KERN_DEFAULT CUT_HERE);
- __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
- return;
- }
- BUG();
-}
-
-/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */
-noinline bool ibt_selftest(void)
-{
- unsigned long ret;
-
- asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t"
- ANNOTATE_RETPOLINE_SAFE
- " jmp *%%rax\n\t"
- "ibt_selftest_ip:\n\t"
- UNWIND_HINT_FUNC
- ANNOTATE_NOENDBR
- " nop\n\t"
-
- : "=a" (ret) : : "memory");
-
- return !ret;
-}
-
-static int __init ibt_setup(char *str)
-{
- if (!strcmp(str, "off"))
- setup_clear_cpu_cap(X86_FEATURE_IBT);
-
- if (!strcmp(str, "warn"))
- ibt_fatal = false;
-
- return 1;
-}
-
-__setup("ibt=", ibt_setup);
-
-#endif /* CONFIG_X86_KERNEL_IBT */
-
#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 796cfaa46bfa..65e96b76c423 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -129,7 +129,7 @@ static void __init vsmp_cap_cpus(void)
static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
{
- return hard_smp_processor_id() >> index_msb;
+ return read_apic_id() >> index_msb;
}
static void vsmp_apic_post_init(void)
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 94c38bea60e7..af662312fd07 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -175,7 +175,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
* scheduled out).
*/
if (pi_test_on(&new))
- apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
+ __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
local_irq_restore(flags);
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index df461f387e20..b483a8baaacf 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4179,7 +4179,7 @@ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
*/
if (vcpu != kvm_get_running_vcpu())
- apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
return;
}
#endif
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 01c5de4c279b..0a81aafed7f8 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -27,7 +27,7 @@
* NOTE! The calling convention is very intentionally the same as
* for 'rep movs', so that we can rewrite the function call with
* just a plain 'rep movs' on machines that have FSRM. But to make
- * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
+ * it simpler for us, we can clobber rsi/rdi and rax freely.
*/
SYM_FUNC_START(rep_movs_alternative)
cmpq $64,%rcx
@@ -68,55 +68,24 @@ SYM_FUNC_START(rep_movs_alternative)
_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
.Llarge:
-0: ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS
+0: ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS
1: RET
- _ASM_EXTABLE_UA( 0b, 1b)
+ _ASM_EXTABLE_UA( 0b, 1b)
- .p2align 4
-.Lunrolled:
-10: movq (%rsi),%r8
-11: movq 8(%rsi),%r9
-12: movq 16(%rsi),%r10
-13: movq 24(%rsi),%r11
-14: movq %r8,(%rdi)
-15: movq %r9,8(%rdi)
-16: movq %r10,16(%rdi)
-17: movq %r11,24(%rdi)
-20: movq 32(%rsi),%r8
-21: movq 40(%rsi),%r9
-22: movq 48(%rsi),%r10
-23: movq 56(%rsi),%r11
-24: movq %r8,32(%rdi)
-25: movq %r9,40(%rdi)
-26: movq %r10,48(%rdi)
-27: movq %r11,56(%rdi)
- addq $64,%rsi
- addq $64,%rdi
- subq $64,%rcx
- cmpq $64,%rcx
- jae .Lunrolled
- cmpl $8,%ecx
- jae .Lword
+.Llarge_movsq:
+ movq %rcx,%rax
+ shrq $3,%rcx
+ andl $7,%eax
+0: rep movsq
+ movl %eax,%ecx
testl %ecx,%ecx
jne .Lcopy_user_tail
RET
- _ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
- _ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
+1: leaq (%rax,%rcx,8),%rcx
+ jmp .Lcopy_user_tail
+
+ _ASM_EXTABLE_UA( 0b, 1b)
SYM_FUNC_END(rep_movs_alternative)
EXPORT_SYMBOL(rep_movs_alternative)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e8711b2cafaf..ab778eac1952 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1112,8 +1112,22 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
(error_code & X86_PF_INSTR), foreign))
return 1;
+ /*
+ * Shadow stack accesses (PF_SHSTK=1) are only permitted to
+ * shadow stack VMAs. All other accesses result in an error.
+ */
+ if (error_code & X86_PF_SHSTK) {
+ if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK)))
+ return 1;
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return 1;
+ return 0;
+ }
+
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
+ if (unlikely(vma->vm_flags & VM_SHADOW_STACK))
+ return 1;
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
return 0;
@@ -1305,6 +1319,14 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ /*
+ * Read-only permissions can not be expressed in shadow stack PTEs.
+ * Treat all shadow stack accesses as WRITE faults. This ensures
+ * that the MM will prepare everything (e.g., break COW) such that
+ * maybe_mkwrite() can create a proper shadow stack PTE.
+ */
+ if (error_code & X86_PF_SHSTK)
+ flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
@@ -1328,7 +1350,6 @@ void do_user_addr_fault(struct pt_regs *regs,
}
#endif
-#ifdef CONFIG_PER_VMA_LOCK
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
@@ -1341,7 +1362,8 @@ void do_user_addr_fault(struct pt_regs *regs,
goto lock_mmap;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
@@ -1358,7 +1380,6 @@ void do_user_addr_fault(struct pt_regs *regs,
return;
}
lock_mmap:
-#endif /* CONFIG_PER_VMA_LOCK */
retry:
vma = lock_mm_and_find_vma(mm, address, regs);
@@ -1418,9 +1439,7 @@ retry:
}
mmap_read_unlock(mm);
-#ifdef CONFIG_PER_VMA_LOCK
done:
-#endif
if (likely(!(fault & VM_FAULT_ERROR)))
return;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ffa25e962343..679893ea5e68 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -306,15 +306,6 @@ static void setup_pcid(void)
* start_secondary().
*/
cr4_set_bits(X86_CR4_PCIDE);
-
- /*
- * INVPCID's single-context modes (2/3) only work if we set
- * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable
- * on systems that have X86_CR4_PCIDE clear, or that have
- * no INVPCID support at all.
- */
- if (boot_cpu_has(X86_FEATURE_INVPCID))
- setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
} else {
/*
* flush_tlb_all(), as currently implemented, won't work if
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index df4182b6449f..bda9f129835e 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2074,12 +2074,12 @@ int set_memory_nx(unsigned long addr, int numpages)
int set_memory_ro(unsigned long addr, int numpages)
{
- return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
}
int set_memory_rox(unsigned long addr, int numpages)
{
- pgprot_t clr = __pgprot(_PAGE_RW);
+ pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY);
if (__supported_pte_mask & _PAGE_NX)
clr.pgprot |= _PAGE_NX;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 15a8009a4480..9deadf517f14 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -52,7 +52,7 @@ early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
- pgtable_pte_page_dtor(pte);
+ pagetable_pte_dtor(page_ptdesc(pte));
paravirt_release_pte(page_to_pfn(pte));
paravirt_tlb_remove_table(tlb, pte);
}
@@ -60,7 +60,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
- struct page *page = virt_to_page(pmd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
* NOTE! For PAE, any changes to the top page-directory-pointer-table
@@ -69,8 +69,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
- pgtable_pmd_page_dtor(page);
- paravirt_tlb_remove_table(tlb, page);
+ pagetable_pmd_dtor(ptdesc);
+ paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc));
}
#if CONFIG_PGTABLE_LEVELS > 3
@@ -92,16 +92,16 @@ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
static inline void pgd_list_add(pgd_t *pgd)
{
- struct page *page = virt_to_page(pgd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
- list_add(&page->lru, &pgd_list);
+ list_add(&ptdesc->pt_list, &pgd_list);
}
static inline void pgd_list_del(pgd_t *pgd)
{
- struct page *page = virt_to_page(pgd);
+ struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
- list_del(&page->lru);
+ list_del(&ptdesc->pt_list);
}
#define UNSHARED_PTRS_PER_PGD \
@@ -112,12 +112,12 @@ static inline void pgd_list_del(pgd_t *pgd)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
- virt_to_page(pgd)->pt_mm = mm;
+ virt_to_ptdesc(pgd)->pt_mm = mm;
}
struct mm_struct *pgd_page_get_mm(struct page *page)
{
- return page->pt_mm;
+ return page_ptdesc(page)->pt_mm;
}
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
@@ -213,11 +213,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
int i;
+ struct ptdesc *ptdesc;
for (i = 0; i < count; i++)
if (pmds[i]) {
- pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
- free_page((unsigned long)pmds[i]);
+ ptdesc = virt_to_ptdesc(pmds[i]);
+
+ pagetable_pmd_dtor(ptdesc);
+ pagetable_free(ptdesc);
mm_dec_nr_pmds(mm);
}
}
@@ -230,18 +233,24 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
+ gfp &= ~__GFP_HIGHMEM;
for (i = 0; i < count; i++) {
- pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
- if (!pmd)
+ pmd_t *pmd = NULL;
+ struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);
+
+ if (!ptdesc)
failed = true;
- if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
- free_page((unsigned long)pmd);
- pmd = NULL;
+ if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
+ ptdesc = NULL;
failed = true;
}
- if (pmd)
+ if (ptdesc) {
mm_inc_nr_pmds(mm);
+ pmd = ptdesc_address(ptdesc);
+ }
+
pmds[i] = pmd;
}
@@ -830,7 +839,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
free_page((unsigned long)pmd_sv);
- pgtable_pmd_page_dtor(virt_to_page(pmd));
+ pagetable_pmd_dtor(virt_to_ptdesc(pmd));
free_page((unsigned long)pmd);
return 1;
@@ -872,3 +881,43 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHADOW_STACK)
+ return pte_mkwrite_shstk(pte);
+
+ pte = pte_mkwrite_novma(pte);
+
+ return pte_clear_saveddirty(pte);
+}
+
+pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHADOW_STACK)
+ return pmd_mkwrite_shstk(pmd);
+
+ pmd = pmd_mkwrite_novma(pmd);
+
+ return pmd_clear_saveddirty(pmd);
+}
+
+void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
+{
+ /*
+ * Hardware before shadow stack can (rarely) set Dirty=1
+ * on a Write=0 PTE. So the below condition
+ * only indicates a software bug when shadow stack is
+ * supported by the HW. This checking is covered in
+ * pte_shstk().
+ */
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
+ pte_shstk(pte));
+}
+
+void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
+{
+ /* See note in arch_check_zapped_pte() */
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
+ pmd_shstk(pmd));
+}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index dac07e4f5834..9c52a95937ad 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -40,9 +40,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
return;
pxm = pa->proximity_domain;
apic_id = pa->apic_id;
- if (!apic->apic_id_valid(apic_id)) {
- printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
- pxm, apic_id);
+ if (!apic_id_valid(apic_id)) {
+ pr_info("SRAT: PXM %u -> X2APIC 0x%04x ignored\n", pxm, apic_id);
return;
}
node = acpi_map_pxm_to_node(pxm);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 267acf27480a..453ea95b667d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -10,6 +10,7 @@
#include <linux/debugfs.h>
#include <linux/sched/smt.h>
#include <linux/task_work.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -1036,6 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
put_flush_tlb_info();
put_cpu();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
@@ -1140,21 +1142,28 @@ void flush_tlb_one_kernel(unsigned long addr)
*/
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
{
- u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ u32 loaded_mm_asid;
+ bool cpu_pcide;
+ /* Flush 'addr' from the kernel PCID: */
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ /* If PTI is off there is no user PCID and nothing to flush. */
if (!static_cpu_has(X86_FEATURE_PTI))
return;
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ cpu_pcide = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE;
+
/*
- * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
- * Just use invalidate_user_asid() in case we are called early.
+ * invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check
+ * 'cpu_pcide' to ensure that *this* CPU will not trigger those
+ * #GP's even if called before CR4.PCIDE has been initialized.
*/
- if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
- invalidate_user_asid(loaded_mm_asid);
- else
+ if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide)
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+ else
+ invalidate_user_asid(loaded_mm_asid);
}
void flush_tlb_one_user(unsigned long addr)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 438adb695daa..a5930042139d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -701,6 +701,38 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
*pprog = prog;
}
+static void emit_movsx_reg(u8 **pprog, int num_bits, bool is64, u32 dst_reg,
+ u32 src_reg)
+{
+ u8 *prog = *pprog;
+
+ if (is64) {
+ /* movs[b,w,l]q dst, src */
+ if (num_bits == 8)
+ EMIT4(add_2mod(0x48, src_reg, dst_reg), 0x0f, 0xbe,
+ add_2reg(0xC0, src_reg, dst_reg));
+ else if (num_bits == 16)
+ EMIT4(add_2mod(0x48, src_reg, dst_reg), 0x0f, 0xbf,
+ add_2reg(0xC0, src_reg, dst_reg));
+ else if (num_bits == 32)
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x63,
+ add_2reg(0xC0, src_reg, dst_reg));
+ } else {
+ /* movs[b,w]l dst, src */
+ if (num_bits == 8) {
+ EMIT4(add_2mod(0x40, src_reg, dst_reg), 0x0f, 0xbe,
+ add_2reg(0xC0, src_reg, dst_reg));
+ } else if (num_bits == 16) {
+ if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, src_reg, dst_reg));
+ EMIT3(add_2mod(0x0f, src_reg, dst_reg), 0xbf,
+ add_2reg(0xC0, src_reg, dst_reg));
+ }
+ }
+
+ *pprog = prog;
+}
+
/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */
static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
{
@@ -779,6 +811,29 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
*pprog = prog;
}
+/* LDSX: dst_reg = *(s8*)(src_reg + off) */
+static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+ u8 *prog = *pprog;
+
+ switch (size) {
+ case BPF_B:
+ /* Emit 'movsx rax, byte ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xBE);
+ break;
+ case BPF_H:
+ /* Emit 'movsx rax, word ptr [rax + off]' */
+ EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xBF);
+ break;
+ case BPF_W:
+ /* Emit 'movsx rax, dword ptr [rax+0x14]' */
+ EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x63);
+ break;
+ }
+ emit_insn_suffix(&prog, src_reg, dst_reg, off);
+ *pprog = prog;
+}
+
/* STX: *(u8*)(dst_reg + off) = src_reg */
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
@@ -1028,9 +1083,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
case BPF_ALU64 | BPF_MOV | BPF_X:
case BPF_ALU | BPF_MOV | BPF_X:
- emit_mov_reg(&prog,
- BPF_CLASS(insn->code) == BPF_ALU64,
- dst_reg, src_reg);
+ if (insn->off == 0)
+ emit_mov_reg(&prog,
+ BPF_CLASS(insn->code) == BPF_ALU64,
+ dst_reg, src_reg);
+ else
+ emit_movsx_reg(&prog, insn->off,
+ BPF_CLASS(insn->code) == BPF_ALU64,
+ dst_reg, src_reg);
break;
/* neg dst */
@@ -1134,15 +1194,26 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
/* mov rax, dst_reg */
emit_mov_reg(&prog, is64, BPF_REG_0, dst_reg);
- /*
- * xor edx, edx
- * equivalent to 'xor rdx, rdx', but one byte less
- */
- EMIT2(0x31, 0xd2);
+ if (insn->off == 0) {
+ /*
+ * xor edx, edx
+ * equivalent to 'xor rdx, rdx', but one byte less
+ */
+ EMIT2(0x31, 0xd2);
- /* div src_reg */
- maybe_emit_1mod(&prog, src_reg, is64);
- EMIT2(0xF7, add_1reg(0xF0, src_reg));
+ /* div src_reg */
+ maybe_emit_1mod(&prog, src_reg, is64);
+ EMIT2(0xF7, add_1reg(0xF0, src_reg));
+ } else {
+ if (BPF_CLASS(insn->code) == BPF_ALU)
+ EMIT1(0x99); /* cdq */
+ else
+ EMIT2(0x48, 0x99); /* cqo */
+
+ /* idiv src_reg */
+ maybe_emit_1mod(&prog, src_reg, is64);
+ EMIT2(0xF7, add_1reg(0xF8, src_reg));
+ }
if (BPF_OP(insn->code) == BPF_MOD &&
dst_reg != BPF_REG_3)
@@ -1262,6 +1333,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
break;
case BPF_ALU | BPF_END | BPF_FROM_BE:
+ case BPF_ALU64 | BPF_END | BPF_FROM_LE:
switch (imm32) {
case 16:
/* Emit 'ror %ax, 8' to swap lower 2 bytes */
@@ -1370,9 +1442,17 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
+ /* LDXS: dst_reg = *(s8*)(src_reg + off) */
+ case BPF_LDX | BPF_MEMSX | BPF_B:
+ case BPF_LDX | BPF_MEMSX | BPF_H:
+ case BPF_LDX | BPF_MEMSX | BPF_W:
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
+ case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
insn_off = insn->off;
- if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
/* Conservatively check that src_reg + insn->off is a kernel address:
* src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
* src_reg is used as scratch for src_reg += insn->off and restored
@@ -1415,8 +1495,13 @@ st: if (is_imm8(insn->off))
start_of_ldx = prog;
end_of_jmp[-1] = start_of_ldx - end_of_jmp;
}
- emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
- if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEMSX ||
+ BPF_MODE(insn->code) == BPF_MEMSX)
+ emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
+ else
+ emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+ BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen + (start_of_ldx - temp);
s64 delta;
@@ -1730,16 +1815,24 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
break;
case BPF_JMP | BPF_JA:
- if (insn->off == -1)
- /* -1 jmp instructions will always jump
- * backwards two bytes. Explicitly handling
- * this case avoids wasting too many passes
- * when there are long sequences of replaced
- * dead code.
- */
- jmp_offset = -2;
- else
- jmp_offset = addrs[i + insn->off] - addrs[i];
+ case BPF_JMP32 | BPF_JA:
+ if (BPF_CLASS(insn->code) == BPF_JMP) {
+ if (insn->off == -1)
+ /* -1 jmp instructions will always jump
+ * backwards two bytes. Explicitly handling
+ * this case avoids wasting too many passes
+ * when there are long sequences of replaced
+ * dead code.
+ */
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ } else {
+ if (insn->imm == -1)
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->imm] - addrs[i];
+ }
if (!jmp_offset) {
/*
@@ -1857,59 +1950,177 @@ emit_jmp:
return proglen;
}
-static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
- int stack_size)
+static void clean_stack_garbage(const struct btf_func_model *m,
+ u8 **pprog, int nr_stack_slots,
+ int stack_size)
+{
+ int arg_size, off;
+ u8 *prog;
+
+ /* Generally speaking, the compiler will pass the arguments
+ * on-stack with "push" instruction, which will take 8-byte
+ * on the stack. In this case, there won't be garbage values
+ * while we copy the arguments from origin stack frame to current
+ * in BPF_DW.
+ *
+ * However, sometimes the compiler will only allocate 4-byte on
+ * the stack for the arguments. For now, this case will only
+ * happen if there is only one argument on-stack and its size
+ * not more than 4 byte. In this case, there will be garbage
+ * values on the upper 4-byte where we store the argument on
+ * current stack frame.
+ *
+ * arguments on origin stack:
+ *
+ * stack_arg_1(4-byte) xxx(4-byte)
+ *
+ * what we copy:
+ *
+ * stack_arg_1(8-byte): stack_arg_1(origin) xxx
+ *
+ * and the xxx is the garbage values which we should clean here.
+ */
+ if (nr_stack_slots != 1)
+ return;
+
+ /* the size of the last argument */
+ arg_size = m->arg_size[m->nr_args - 1];
+ if (arg_size <= 4) {
+ off = -(stack_size - 4);
+ prog = *pprog;
+ /* mov DWORD PTR [rbp + off], 0 */
+ if (!is_imm8(off))
+ EMIT2_off32(0xC7, 0x85, off);
+ else
+ EMIT3(0xC7, 0x45, off);
+ EMIT(0, 4);
+ *pprog = prog;
+ }
+}
+
+/* get the count of the regs that are used to pass arguments */
+static int get_nr_used_regs(const struct btf_func_model *m)
{
- int i, j, arg_size;
- bool next_same_struct = false;
+ int i, arg_regs, nr_used_regs = 0;
+
+ for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
+ arg_regs = (m->arg_size[i] + 7) / 8;
+ if (nr_used_regs + arg_regs <= 6)
+ nr_used_regs += arg_regs;
+
+ if (nr_used_regs >= 6)
+ break;
+ }
+
+ return nr_used_regs;
+}
+
+static void save_args(const struct btf_func_model *m, u8 **prog,
+ int stack_size, bool for_call_origin)
+{
+ int arg_regs, first_off = 0, nr_regs = 0, nr_stack_slots = 0;
+ int i, j;
/* Store function arguments to stack.
* For a function that accepts two pointers the sequence will be:
* mov QWORD PTR [rbp-0x10],rdi
* mov QWORD PTR [rbp-0x8],rsi
*/
- for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
- /* The arg_size is at most 16 bytes, enforced by the verifier. */
- arg_size = m->arg_size[j];
- if (arg_size > 8) {
- arg_size = 8;
- next_same_struct = !next_same_struct;
- }
+ for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
+ arg_regs = (m->arg_size[i] + 7) / 8;
- emit_stx(prog, bytes_to_bpf_size(arg_size),
- BPF_REG_FP,
- i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
- -(stack_size - i * 8));
+ /* According to the research of Yonghong, struct members
+ * should be all in register or all on the stack.
+ * Meanwhile, the compiler will pass the argument on regs
+ * if the remaining regs can hold the argument.
+ *
+ * Disorder of the args can happen. For example:
+ *
+ * struct foo_struct {
+ * long a;
+ * int b;
+ * };
+ * int foo(char, char, char, char, char, struct foo_struct,
+ * char);
+ *
+ * the arg1-5,arg7 will be passed by regs, and arg6 will
+ * by stack.
+ */
+ if (nr_regs + arg_regs > 6) {
+ /* copy function arguments from origin stack frame
+ * into current stack frame.
+ *
+ * The starting address of the arguments on-stack
+ * is:
+ * rbp + 8(push rbp) +
+ * 8(return addr of origin call) +
+ * 8(return addr of the caller)
+ * which means: rbp + 24
+ */
+ for (j = 0; j < arg_regs; j++) {
+ emit_ldx(prog, BPF_DW, BPF_REG_0, BPF_REG_FP,
+ nr_stack_slots * 8 + 0x18);
+ emit_stx(prog, BPF_DW, BPF_REG_FP, BPF_REG_0,
+ -stack_size);
+
+ if (!nr_stack_slots)
+ first_off = stack_size;
+ stack_size -= 8;
+ nr_stack_slots++;
+ }
+ } else {
+ /* Only copy the arguments on-stack to current
+ * 'stack_size' and ignore the regs, used to
+ * prepare the arguments on-stack for orign call.
+ */
+ if (for_call_origin) {
+ nr_regs += arg_regs;
+ continue;
+ }
- j = next_same_struct ? j : j + 1;
+ /* copy the arguments from regs into stack */
+ for (j = 0; j < arg_regs; j++) {
+ emit_stx(prog, BPF_DW, BPF_REG_FP,
+ nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs,
+ -stack_size);
+ stack_size -= 8;
+ nr_regs++;
+ }
+ }
}
+
+ clean_stack_garbage(m, prog, nr_stack_slots, first_off);
}
-static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
+static void restore_regs(const struct btf_func_model *m, u8 **prog,
int stack_size)
{
- int i, j, arg_size;
- bool next_same_struct = false;
+ int i, j, arg_regs, nr_regs = 0;
/* Restore function arguments from stack.
* For a function that accepts two pointers the sequence will be:
* EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
* EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
+ *
+ * The logic here is similar to what we do in save_args()
*/
- for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
- /* The arg_size is at most 16 bytes, enforced by the verifier. */
- arg_size = m->arg_size[j];
- if (arg_size > 8) {
- arg_size = 8;
- next_same_struct = !next_same_struct;
+ for (i = 0; i < min_t(int, m->nr_args, MAX_BPF_FUNC_ARGS); i++) {
+ arg_regs = (m->arg_size[i] + 7) / 8;
+ if (nr_regs + arg_regs <= 6) {
+ for (j = 0; j < arg_regs; j++) {
+ emit_ldx(prog, BPF_DW,
+ nr_regs == 5 ? X86_REG_R9 : BPF_REG_1 + nr_regs,
+ BPF_REG_FP,
+ -stack_size);
+ stack_size -= 8;
+ nr_regs++;
+ }
+ } else {
+ stack_size -= 8 * arg_regs;
}
- emit_ldx(prog, bytes_to_bpf_size(arg_size),
- i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
- BPF_REG_FP,
- -(stack_size - i * 8));
-
- j = next_same_struct ? j : j + 1;
+ if (nr_regs >= 6)
+ break;
}
}
@@ -1938,7 +2149,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
/* arg1: mov rdi, progs[i] */
emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
/* arg2: lea rsi, [rbp - ctx_cookie_off] */
- EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
+ if (!is_imm8(-run_ctx_off))
+ EMIT3_off32(0x48, 0x8D, 0xB5, -run_ctx_off);
+ else
+ EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
return -EINVAL;
@@ -1954,7 +2168,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
emit_nops(&prog, 2);
/* arg1: lea rdi, [rbp - stack_size] */
- EMIT4(0x48, 0x8D, 0x7D, -stack_size);
+ if (!is_imm8(-stack_size))
+ EMIT3_off32(0x48, 0x8D, 0xBD, -stack_size);
+ else
+ EMIT4(0x48, 0x8D, 0x7D, -stack_size);
/* arg2: progs[i]->insnsi for interpreter */
if (!p->jited)
emit_mov_imm64(&prog, BPF_REG_2,
@@ -1984,7 +2201,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
/* arg2: mov rsi, rbx <- start time in nsec */
emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
/* arg3: lea rdx, [rbp - run_ctx_off] */
- EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
+ if (!is_imm8(-run_ctx_off))
+ EMIT3_off32(0x48, 0x8D, 0x95, -run_ctx_off);
+ else
+ EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
return -EINVAL;
@@ -2136,7 +2356,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
void *func_addr)
{
int i, ret, nr_regs = m->nr_args, stack_size = 0;
- int regs_off, nregs_off, ip_off, run_ctx_off;
+ int regs_off, nregs_off, ip_off, run_ctx_off, arg_stack_off, rbx_off;
struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
@@ -2150,8 +2370,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
nr_regs += (m->arg_size[i] + 7) / 8 - 1;
- /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
- if (nr_regs > 6)
+ /* x86-64 supports up to MAX_BPF_FUNC_ARGS arguments. 1-6
+ * are passed through regs, the remains are through stack.
+ */
+ if (nr_regs > MAX_BPF_FUNC_ARGS)
return -ENOTSUPP;
/* Generated trampoline stack layout:
@@ -2170,7 +2392,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
*
* RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
*
+ * RBP - rbx_off [ rbx value ] always
+ *
* RBP - run_ctx_off [ bpf_tramp_run_ctx ]
+ *
+ * [ stack_argN ] BPF_TRAMP_F_CALL_ORIG
+ * [ ... ]
+ * [ stack_arg2 ]
+ * RBP - arg_stack_off [ stack_arg1 ]
*/
/* room for return value of orig_call or fentry prog */
@@ -2190,9 +2419,26 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
ip_off = stack_size;
+ stack_size += 8;
+ rbx_off = stack_size;
+
stack_size += (sizeof(struct bpf_tramp_run_ctx) + 7) & ~0x7;
run_ctx_off = stack_size;
+ if (nr_regs > 6 && (flags & BPF_TRAMP_F_CALL_ORIG)) {
+ /* the space that used to pass arguments on-stack */
+ stack_size += (nr_regs - get_nr_used_regs(m)) * 8;
+ /* make sure the stack pointer is 16-byte aligned if we
+ * need pass arguments on stack, which means
+ * [stack_size + 8(rbp) + 8(rip) + 8(origin rip)]
+ * should be 16-byte aligned. Following code depend on
+ * that stack_size is already 8-byte aligned.
+ */
+ stack_size += (stack_size % 16) ? 0 : 8;
+ }
+
+ arg_stack_off = stack_size;
+
if (flags & BPF_TRAMP_F_SKIP_FRAME) {
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
@@ -2212,8 +2458,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
x86_call_depth_emit_accounting(&prog, NULL);
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
- EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
- EMIT1(0x53); /* push rbx */
+ if (!is_imm8(stack_size))
+ /* sub rsp, stack_size */
+ EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
+ else
+ /* sub rsp, stack_size */
+ EMIT4(0x48, 0x83, 0xEC, stack_size);
+ /* mov QWORD PTR [rbp - rbx_off], rbx */
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_6, -rbx_off);
/* Store number of argument registers of the traced function:
* mov rax, nr_regs
@@ -2231,7 +2483,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
- save_regs(m, &prog, nr_regs, regs_off);
+ save_args(m, &prog, regs_off, false);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
@@ -2261,7 +2513,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- restore_regs(m, &prog, nr_regs, regs_off);
+ restore_regs(m, &prog, regs_off);
+ save_args(m, &prog, arg_stack_off, true);
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
@@ -2302,7 +2555,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_RESTORE_REGS)
- restore_regs(m, &prog, nr_regs, regs_off);
+ restore_regs(m, &prog, regs_off);
/* This needs to be done regardless. If there were fmod_ret programs,
* the return value is only updated on the stack and still needs to be
@@ -2321,7 +2574,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (save_ret)
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
- EMIT1(0x5B); /* pop rbx */
+ emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
EMIT1(0xC9); /* leave */
if (flags & BPF_TRAMP_F_SKIP_FRAME)
/* skip our return address and return to parent */
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index dd40d3fea74e..631512f7ec85 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -51,6 +51,14 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link)
return NULL;
}
+static inline resource_size_t cap_resource(u64 val)
+{
+ if (val > RESOURCE_SIZE_MAX)
+ return RESOURCE_SIZE_MAX;
+
+ return val;
+}
+
/**
* early_root_info_init()
* called before pcibios_scan_root and pci_scan_bus
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 2752c02e3f0e..e4a525e59eaf 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -101,7 +101,7 @@ void update_res(struct pci_root_info *info, resource_size_t start,
if (start > end)
return;
- if (start == MAX_RESOURCE)
+ if (start == RESOURCE_SIZE_MAX)
return;
if (!merge)
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index a498b847d740..0de436316a1d 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -136,14 +136,14 @@ static inline struct irq_routing_table *pirq_convert_irt_table(u8 *addr,
if (ir->signature != IRT_SIGNATURE || !ir->used || ir->size < ir->used)
return NULL;
- size = sizeof(*ir) + ir->used * sizeof(ir->slots[0]);
+ size = struct_size(ir, slots, ir->used);
if (size > limit - addr)
return NULL;
DBG(KERN_DEBUG "PCI: $IRT Interrupt Routing Table found at 0x%lx\n",
__pa(ir));
- size = sizeof(*rt) + ir->used * sizeof(rt->slots[0]);
+ size = struct_size(rt, slots, ir->used);
rt = kzalloc(size, GFP_KERNEL);
if (!rt)
return NULL;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 014c508e914d..652cd53e77f6 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -517,7 +517,7 @@ int __init pci_xen_init(void)
#ifdef CONFIG_PCI_MSI
static void __init xen_hvm_msi_init(void)
{
- if (!disable_apic) {
+ if (!apic_is_disabled) {
/*
* If hardware supports (x2)APIC virtualization (as indicated
* by hypervisor's leaf 4) then we don't need to use pirqs/
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index ee21d6a36a80..4221259a5870 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -58,7 +58,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
ret = parent->chip->irq_set_affinity(parent, mask, force);
if (ret >= 0) {
uv_program_mmr(cfg, data->chip_data);
- send_cleanup_vector(cfg);
+ vector_schedule_cleanup(cfg);
}
return ret;
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index a6ab43f69b7d..45d0c17ce77c 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -597,7 +597,7 @@ static void uv_nmi_nr_cpus_ping(void)
for_each_cpu(cpu, uv_nmi_cpu_mask)
uv_cpu_nmi_per(cpu).pinging = 1;
- apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI);
+ __apic_send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI);
}
/* Clean up flags for CPU's that ignored both NMI and ping */
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index ee89f6bb9242..8bc72a51b257 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -13,15 +13,16 @@ obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \
ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \
stub_$(BITS).o stub_segv.o \
sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \
- mem_$(BITS).o subarch.o os-$(OS)/
+ mem_$(BITS).o subarch.o os-Linux/
ifeq ($(CONFIG_X86_32),y)
-obj-y += checksum_32.o syscalls_32.o
+obj-y += syscalls_32.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o
+subarch-y += ../lib/checksum_32.o
subarch-y += ../kernel/sys_ia32.o
else
diff --git a/arch/x86/um/asm/mm_context.h b/arch/x86/um/asm/mm_context.h
index 4a73d63e4760..dc32dc023c2f 100644
--- a/arch/x86/um/asm/mm_context.h
+++ b/arch/x86/um/asm/mm_context.h
@@ -11,8 +11,6 @@
#include <linux/mutex.h>
#include <asm/ldt.h>
-extern void ldt_host_info(void);
-
#define LDT_PAGES_MAX \
((LDT_ENTRIES * LDT_ENTRY_SIZE)/PAGE_SIZE)
#define LDT_ENTRIES_PER_PAGE \
diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
deleted file mode 100644
index aed782ab7721..000000000000
--- a/arch/x86/um/checksum_32.S
+++ /dev/null
@@ -1,214 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * IP/TCP/UDP checksumming routines
- *
- * Authors: Jorge Cwik, <jorge@laser.satlink.net>
- * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
- * Tom May, <ftom@netcom.com>
- * Pentium Pro/II routines:
- * Alexander Kjeldaas <astor@guardian.no>
- * Finn Arne Gangstad <finnag@guardian.no>
- * Lots of code moved from tcp.c and ip.c; see those files
- * for more names.
- *
- * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
- * handling.
- * Andi Kleen, add zeroing on error
- * converted to pure assembler
- */
-
-#include <asm/errno.h>
-#include <asm/asm.h>
-#include <asm/export.h>
-
-/*
- * computes a partial checksum, e.g. for TCP/UDP fragments
- */
-
-/*
-unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
- */
-
-.text
-.align 4
-.globl csum_partial
-
-#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
-
- /*
- * Experiments with Ethernet and SLIP connections show that buff
- * is aligned on either a 2-byte or 4-byte boundary. We get at
- * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
- * Fortunately, it is easy to convert 2-byte alignment to 4-byte
- * alignment for the unrolled loop.
- */
-csum_partial:
- pushl %esi
- pushl %ebx
- movl 20(%esp),%eax # Function arg: unsigned int sum
- movl 16(%esp),%ecx # Function arg: int len
- movl 12(%esp),%esi # Function arg: unsigned char *buff
- testl $2, %esi # Check alignment.
- jz 2f # Jump if alignment is ok.
- subl $2, %ecx # Alignment uses up two bytes.
- jae 1f # Jump if we had at least two bytes.
- addl $2, %ecx # ecx was < 2. Deal with it.
- jmp 4f
-1: movw (%esi), %bx
- addl $2, %esi
- addw %bx, %ax
- adcl $0, %eax
-2:
- movl %ecx, %edx
- shrl $5, %ecx
- jz 2f
- testl %esi, %esi
-1: movl (%esi), %ebx
- adcl %ebx, %eax
- movl 4(%esi), %ebx
- adcl %ebx, %eax
- movl 8(%esi), %ebx
- adcl %ebx, %eax
- movl 12(%esi), %ebx
- adcl %ebx, %eax
- movl 16(%esi), %ebx
- adcl %ebx, %eax
- movl 20(%esi), %ebx
- adcl %ebx, %eax
- movl 24(%esi), %ebx
- adcl %ebx, %eax
- movl 28(%esi), %ebx
- adcl %ebx, %eax
- lea 32(%esi), %esi
- dec %ecx
- jne 1b
- adcl $0, %eax
-2: movl %edx, %ecx
- andl $0x1c, %edx
- je 4f
- shrl $2, %edx # This clears CF
-3: adcl (%esi), %eax
- lea 4(%esi), %esi
- dec %edx
- jne 3b
- adcl $0, %eax
-4: andl $3, %ecx
- jz 7f
- cmpl $2, %ecx
- jb 5f
- movw (%esi),%cx
- leal 2(%esi),%esi
- je 6f
- shll $16,%ecx
-5: movb (%esi),%cl
-6: addl %ecx,%eax
- adcl $0, %eax
-7:
- popl %ebx
- popl %esi
- RET
-
-#else
-
-/* Version for PentiumII/PPro */
-
-csum_partial:
- pushl %esi
- pushl %ebx
- movl 20(%esp),%eax # Function arg: unsigned int sum
- movl 16(%esp),%ecx # Function arg: int len
- movl 12(%esp),%esi # Function arg: const unsigned char *buf
-
- testl $2, %esi
- jnz 30f
-10:
- movl %ecx, %edx
- movl %ecx, %ebx
- andl $0x7c, %ebx
- shrl $7, %ecx
- addl %ebx,%esi
- shrl $2, %ebx
- negl %ebx
- lea 45f(%ebx,%ebx,2), %ebx
- testl %esi, %esi
- jmp *%ebx
-
- # Handle 2-byte-aligned regions
-20: addw (%esi), %ax
- lea 2(%esi), %esi
- adcl $0, %eax
- jmp 10b
-
-30: subl $2, %ecx
- ja 20b
- je 32f
- movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
- addl %ebx, %eax
- adcl $0, %eax
- jmp 80f
-32:
- addw (%esi), %ax # csumming 2 bytes, 2-aligned
- adcl $0, %eax
- jmp 80f
-
-40:
- addl -128(%esi), %eax
- adcl -124(%esi), %eax
- adcl -120(%esi), %eax
- adcl -116(%esi), %eax
- adcl -112(%esi), %eax
- adcl -108(%esi), %eax
- adcl -104(%esi), %eax
- adcl -100(%esi), %eax
- adcl -96(%esi), %eax
- adcl -92(%esi), %eax
- adcl -88(%esi), %eax
- adcl -84(%esi), %eax
- adcl -80(%esi), %eax
- adcl -76(%esi), %eax
- adcl -72(%esi), %eax
- adcl -68(%esi), %eax
- adcl -64(%esi), %eax
- adcl -60(%esi), %eax
- adcl -56(%esi), %eax
- adcl -52(%esi), %eax
- adcl -48(%esi), %eax
- adcl -44(%esi), %eax
- adcl -40(%esi), %eax
- adcl -36(%esi), %eax
- adcl -32(%esi), %eax
- adcl -28(%esi), %eax
- adcl -24(%esi), %eax
- adcl -20(%esi), %eax
- adcl -16(%esi), %eax
- adcl -12(%esi), %eax
- adcl -8(%esi), %eax
- adcl -4(%esi), %eax
-45:
- lea 128(%esi), %esi
- adcl $0, %eax
- dec %ecx
- jge 40b
- movl %edx, %ecx
-50: andl $3, %ecx
- jz 80f
-
- # Handle the last 1-3 bytes without jumping
- notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
- movl $0xffffff,%ebx # by the shll and shrl instructions
- shll $3,%ecx
- shrl %cl,%ebx
- andl -128(%esi),%ebx # esi is 4-aligned so should be ok
- addl %ebx,%eax
- adcl $0,%eax
-80:
- popl %ebx
- popl %esi
- RET
-
-#endif
- EXPORT_SYMBOL(csum_partial)
diff --git a/arch/x86/video/Makefile b/arch/x86/video/Makefile
index 11640c116115..5ebe48752ffc 100644
--- a/arch/x86/video/Makefile
+++ b/arch/x86/video/Makefile
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_FB) += fbdev.o
+obj-$(CONFIG_FB_CORE) += fbdev.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 62d34b6611c5..7ad91225fdf4 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -81,6 +81,11 @@ static void xen_apic_write(u32 reg, u32 val)
WARN(1,"register: %x, value: %x\n", reg, val);
}
+static void xen_apic_eoi(void)
+{
+ WARN_ON_ONCE(1);
+}
+
static u64 xen_apic_icr_read(void)
{
return 0;
@@ -92,11 +97,6 @@ static void xen_apic_icr_write(u32 low, u32 id)
WARN_ON(1);
}
-static u32 xen_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
static int xen_apic_probe_pv(void)
{
if (xen_pv_domain())
@@ -110,29 +110,11 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
return xen_pv_domain();
}
-static int xen_id_always_valid(u32 apicid)
-{
- return 1;
-}
-
-static int xen_id_always_registered(void)
-{
- return 1;
-}
-
static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
{
return initial_apic_id >> index_msb;
}
-static void xen_noop(void)
-{
-}
-
-static void xen_silent_inquire(int apicid)
-{
-}
-
static int xen_cpu_present_to_apicid(int cpu)
{
if (cpu_present(cpu))
@@ -141,68 +123,41 @@ static int xen_cpu_present_to_apicid(int cpu)
return BAD_APICID;
}
-static struct apic xen_pv_apic = {
- .name = "Xen PV",
- .probe = xen_apic_probe_pv,
+static struct apic xen_pv_apic __ro_after_init = {
+ .name = "Xen PV",
+ .probe = xen_apic_probe_pv,
.acpi_madt_oem_check = xen_madt_oem_check,
- .apic_id_valid = xen_id_always_valid,
- .apic_id_registered = xen_id_always_registered,
/* .delivery_mode and .dest_mode_logical not used by XENPV */
.disable_esr = 0,
- .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
- .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */
- .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = xen_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */
- .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */
.phys_pkg_id = xen_phys_pkg_id, /* detect_ht */
- .get_apic_id = xen_get_apic_id,
- .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
+ .max_apic_id = UINT_MAX,
+ .get_apic_id = xen_get_apic_id,
+ .set_apic_id = xen_set_apic_id,
.calc_dest_apicid = apic_flat_calc_apicid,
#ifdef CONFIG_SMP
- .send_IPI_mask = xen_send_IPI_mask,
- .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
- .send_IPI_allbutself = xen_send_IPI_allbutself,
- .send_IPI_all = xen_send_IPI_all,
- .send_IPI_self = xen_send_IPI_self,
+ .send_IPI_mask = xen_send_IPI_mask,
+ .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = xen_send_IPI_allbutself,
+ .send_IPI_all = xen_send_IPI_all,
+ .send_IPI_self = xen_send_IPI_self,
#endif
- /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */
- .inquire_remote_apic = xen_silent_inquire,
-
.read = xen_apic_read,
.write = xen_apic_write,
- .eoi_write = xen_apic_write,
+ .eoi = xen_apic_eoi,
- .icr_read = xen_apic_icr_read,
- .icr_write = xen_apic_icr_write,
- .wait_icr_idle = xen_noop,
- .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
+ .icr_read = xen_apic_icr_read,
+ .icr_write = xen_apic_icr_write,
};
+apic_driver(xen_pv_apic);
-static void __init xen_apic_check(void)
-{
- if (apic == &xen_pv_apic)
- return;
-
- pr_info("Switched APIC routing from %s to %s.\n", apic->name,
- xen_pv_apic.name);
- apic = &xen_pv_apic;
-}
void __init xen_init_apic(void)
{
x86_apic_ops.io_apic_read = xen_io_apic_read;
- /* On PV guests the APIC CPUID bit is disabled so none of the
- * routines end up executing. */
- if (!xen_initial_domain())
- apic = &xen_pv_apic;
-
- x86_platform.apic_post_init = xen_apic_check;
}
-apic_driver(xen_pv_apic);
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index a6820ca940bf..9a192f51f1b0 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -132,7 +132,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback)
struct pt_regs *old_regs = set_irq_regs(regs);
if (xen_percpu_upcall)
- ack_APIC_irq();
+ apic_eoi();
inc_irq_stat(irq_hv_callback_count);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 93b658248d01..49352fad7d1d 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -79,7 +79,7 @@
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
-#include <acpi/pdc_intel.h>
+#include <acpi/proc_cap_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif
@@ -288,17 +288,17 @@ static bool __init xen_check_mwait(void)
native_cpuid(&ax, &bx, &cx, &dx);
- /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+ /* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so,
* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
*/
buf[0] = ACPI_PDC_REVISION_ID;
buf[1] = 1;
- buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+ buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP);
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
if ((HYPERVISOR_platform_op(&op) == 0) &&
- (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+ (buf[2] & (ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
}
@@ -523,7 +523,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
- pfn = virt_to_pfn(va);
+ pfn = virt_to_pfn((void *)va);
mfn = pfn_to_mfn(pfn);
pte = pfn_pte(pfn, PAGE_KERNEL_RO);
@@ -694,7 +694,7 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_coprocessor_error, false ),
TRAP_ENTRY(exc_alignment_check, false ),
TRAP_ENTRY(exc_simd_coprocessor_error, false ),
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
TRAP_ENTRY(exc_control_protection, false ),
#endif
};
@@ -1326,7 +1326,7 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
x86_init.resources.memory_setup = xen_memory_setup;
x86_init.irqs.intr_mode_select = x86_init_noop;
- x86_init.irqs.intr_mode_init = x86_init_noop;
+ x86_init.irqs.intr_mode_init = x86_64_probe_apic;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
x86_init.hyper.init_platform = xen_pv_init_platform;
@@ -1366,12 +1366,10 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
xen_init_capabilities();
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* set up the basic apic ops.
*/
xen_init_apic();
-#endif
machine_ops = xen_machine_ops;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index e0a975165de7..1652c39e3dfb 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -166,7 +166,7 @@ void make_lowmem_page_readwrite(void *vaddr)
if (pte == NULL)
return; /* vaddr missing */
- ptev = pte_mkwrite(*pte);
+ ptev = pte_mkwrite_novma(*pte);
if (HYPERVISOR_update_va_mapping(address, ptev, 0))
BUG();
@@ -667,7 +667,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
spinlock_t *ptl = NULL;
#if USE_SPLIT_PTE_PTLOCKS
- ptl = ptlock_ptr(page);
+ ptl = ptlock_ptr(page_ptdesc(page));
spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
@@ -2202,13 +2202,13 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
mcs = __xen_mc_entry(0);
if (in_frames)
- in_frames[i] = virt_to_mfn(vaddr);
+ in_frames[i] = virt_to_mfn((void *)vaddr);
MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
- __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+ __set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY);
if (out_frames)
- out_frames[i] = virt_to_pfn(vaddr);
+ out_frames[i] = virt_to_pfn((void *)vaddr);
}
xen_mc_issue(0);
}
@@ -2250,7 +2250,7 @@ static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
MULTI_update_va_mapping(mcs.mc, vaddr,
mfn_pte(mfn, PAGE_KERNEL), flags);
- set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn);
}
xen_mc_issue(0);
@@ -2310,12 +2310,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
int success;
unsigned long vstart = (unsigned long)phys_to_virt(pstart);
- /*
- * Currently an auto-translated guest will not perform I/O, nor will
- * it require PAE page directories below 4GB. Therefore any calls to
- * this function are redundant and can be ignored.
- */
-
if (unlikely(order > MAX_CONTIG_ORDER))
return -ENOMEM;
@@ -2327,7 +2321,7 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
xen_zap_pfn_range(vstart, order, in_frames, NULL);
/* 2. Get a new contiguous memory extent. */
- out_frame = virt_to_pfn(vstart);
+ out_frame = virt_to_pfn((void *)vstart);
success = xen_exchange_memory(1UL << order, 0, in_frames,
1, order, &out_frame,
address_bits);
@@ -2360,7 +2354,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
spin_lock_irqsave(&xen_reservation_lock, flags);
/* 1. Find start MFN of contiguous extent. */
- in_frame = virt_to_mfn(vstart);
+ in_frame = virt_to_mfn((void *)vstart);
/* 2. Zap current PTEs. */
xen_zap_pfn_range(vstart, order, NULL, out_frames);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8b5cf7bb1f47..b3e37961065a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -44,6 +44,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
+/* Memory map would allow PCI passthrough. */
+bool xen_pv_pci_possible;
+
/* E820 map used during setting up memory. */
static struct e820_table xen_e820_table __initdata;
@@ -340,7 +343,7 @@ static void __init xen_do_set_identity_and_remap_chunk(
WARN_ON(size == 0);
- mfn_save = virt_to_mfn(buf);
+ mfn_save = virt_to_mfn((void *)buf);
for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
ident_pfn_iter < ident_end_pfn;
@@ -503,7 +506,7 @@ void __init xen_remap_memory(void)
unsigned long pfn_s = ~0UL;
unsigned long len = 0;
- mfn_save = virt_to_mfn(buf);
+ mfn_save = virt_to_mfn((void *)buf);
while (xen_remap_mfn != INVALID_P2M_ENTRY) {
/* Map the remap information */
@@ -814,6 +817,9 @@ char * __init xen_memory_setup(void)
chunk_size = size;
type = xen_e820_table.entries[i].type;
+ if (type == E820_TYPE_RESERVED)
+ xen_pv_pci_possible = true;
+
if (type == E820_TYPE_RAM) {
if (addr < mem_end) {
chunk_size = min(size, mem_end - addr);
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index cef78b8c89f4..a0f07bbfcd6e 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -182,7 +182,8 @@ static void __init _get_smp_config(unsigned int early)
if (subtract)
set_nr_cpu_ids(nr_cpu_ids - subtract);
#endif
-
+ /* Pretend to be a proper enumerated system */
+ smp_found_config = 1;
}
static void __init xen_pv_smp_prepare_boot_cpu(void)
@@ -210,7 +211,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
- if (skip_ioapic_setup) {
+ if (ioapic_is_disabled) {
char *m = (max_cpus == 0) ?
"The nosmp parameter is incompatible with Xen; " \
"use Xen dom0_max_vcpus=1 parameter" :
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 08f1ceb9eb81..9e5e68008785 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -148,7 +148,7 @@ xen_pv_trap asm_exc_page_fault
xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
xen_pv_trap asm_exc_control_protection
#endif
#ifdef CONFIG_X86_MCE