diff options
Diffstat (limited to 'arch/x86')
285 files changed, 5880 insertions, 3269 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2792879d398e..0fc82237414d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -165,6 +165,7 @@ config X86 select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD select HAVE_ARCH_VMAP_STACK if X86_64 + select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_ASM_MODVERSIONS select HAVE_CMPXCHG_DOUBLE @@ -360,10 +361,6 @@ config X86_64_SMP def_bool y depends on X86_64 && SMP -config X86_32_LAZY_GS - def_bool y - depends on X86_32 && !STACKPROTECTOR - config ARCH_SUPPORTS_UPROBES def_bool y @@ -386,7 +383,8 @@ config CC_HAS_SANE_STACKPROTECTOR default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC)) help We have to make sure stack protector is unconditionally disabled if - the compiler produces broken code. + the compiler produces broken code or if it does not let us control + the segment on 32-bit kernels. menu "Processor type and features" @@ -571,6 +569,7 @@ config X86_UV depends on X86_EXTENDED_PLATFORM depends on NUMA depends on EFI + depends on KEXEC_CORE depends on X86_X2APIC depends on PCI help @@ -777,6 +776,7 @@ if HYPERVISOR_GUEST config PARAVIRT bool "Enable paravirtualization code" + depends on HAVE_STATIC_CALL help This changes the kernel so it can modify itself when it is run under a hypervisor, potentially improving performance significantly @@ -1406,7 +1406,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 + depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 select X86_PAE help Select this if you have a 32-bit processor and more than 4 @@ -1518,6 +1518,7 @@ config AMD_MEM_ENCRYPT select ARCH_USE_MEMREMAP_PROT select ARCH_HAS_FORCE_DMA_UNENCRYPTED select INSTRUCTION_DECODER + select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS help Say yes to enable support for the encryption of system memory. This requires an AMD processor that supports Secure Memory @@ -1931,6 +1932,7 @@ config X86_SGX depends on CRYPTO_SHA256=y select SRCU select MMU_NOTIFIER + select NUMA_KEEP_MEMINFO if NUMA help Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions that can be used by applications to set aside private regions of code diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d6d5a28c3bf..465de4a78a74 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,12 +27,13 @@ endif REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse + -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) REALMODE_CFLAGS += -ffreestanding REALMODE_CFLAGS += -fno-stack-protector REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) +REALMODE_CFLAGS += $(CLANG_FLAGS) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -79,6 +80,14 @@ ifeq ($(CONFIG_X86_32),y) # temporary until string.h is fixed KBUILD_CFLAGS += -ffreestanding + + ifeq ($(CONFIG_STACKPROTECTOR),y) + ifeq ($(CONFIG_SMP),y) + KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard + else + KBUILD_CFLAGS += -mstack-protector-guard=global + endif + endif else BITS := 64 UTS_MACHINE := x86_64 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e0bc3988c3fa..6e5522aebbbd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -46,6 +46,7 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS # Disable relocation relaxation in case the link is not PIE. KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h +KBUILD_CFLAGS += $(CLANG_FLAGS) # sev-es.c indirectly inludes inat-table.h which is generated during # compilation and stored in $(objtree). Add the directory to the includes so diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S index c4bb0f9363f5..95a223b3e56a 100644 --- a/arch/x86/boot/compressed/efi_thunk_64.S +++ b/arch/x86/boot/compressed/efi_thunk_64.S @@ -5,7 +5,7 @@ * Early support for invoking 32-bit EFI services from a 64-bit kernel. * * Because this thunking occurs before ExitBootServices() we have to - * restore the firmware's 32-bit GDT before we make EFI serivce calls, + * restore the firmware's 32-bit GDT before we make EFI service calls, * since the firmware's 32-bit IDT is still currently installed and it * needs to be able to service interrupts. * diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index e94874f4bbc1..a2347ded77ea 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -34,6 +34,7 @@ #include <asm/asm-offsets.h> #include <asm/bootparam.h> #include <asm/desc_defs.h> +#include <asm/trapnr.h> #include "pgtable.h" /* @@ -107,9 +108,19 @@ SYM_FUNC_START(startup_32) movl %eax, %gs movl %eax, %ss -/* setup a stack and make sure cpu supports long mode. */ + /* Setup a stack and load CS from current GDT */ leal rva(boot_stack_end)(%ebp), %esp + pushl $__KERNEL32_CS + leal rva(1f)(%ebp), %eax + pushl %eax + lretl +1: + + /* Setup Exception handling for SEV-ES */ + call startup32_load_idt + + /* Make sure cpu supports long mode. */ call verify_cpu testl %eax, %eax jnz .Lno_longmode @@ -172,11 +183,21 @@ SYM_FUNC_START(startup_32) */ call get_sev_encryption_bit xorl %edx, %edx +#ifdef CONFIG_AMD_MEM_ENCRYPT testl %eax, %eax jz 1f subl $32, %eax /* Encryption bit is always above bit 31 */ bts %eax, %edx /* Set encryption mask for page tables */ + /* + * Mark SEV as active in sev_status so that startup32_check_sev_cbit() + * will do a check. The sev_status memory will be fully initialized + * with the contents of MSR_AMD_SEV_STATUS later in + * set_sev_encryption_mask(). For now it is sufficient to know that SEV + * is active. + */ + movl $1, rva(sev_status)(%ebp) 1: +#endif /* Initialize Page tables to 0 */ leal rva(pgtable)(%ebx), %edi @@ -231,7 +252,7 @@ SYM_FUNC_START(startup_32) /* * Setup for the jump to 64bit mode * - * When the jump is performend we will be in long mode but + * When the jump is performed we will be in long mode but * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 * (and in turn EFER.LMA = 1). To jump into 64bit mode we use * the new gdt/idt that has __KERNEL_CS with CS.L = 1. @@ -261,6 +282,9 @@ SYM_FUNC_START(startup_32) movl %esi, %edx 1: #endif + /* Check if the C-bit position is correct when SEV is active */ + call startup32_check_sev_cbit + pushl $__KERNEL_CS pushl %eax @@ -694,6 +718,19 @@ SYM_DATA_START(boot_idt) .endr SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end) +#ifdef CONFIG_AMD_MEM_ENCRYPT +SYM_DATA_START(boot32_idt_desc) + .word boot32_idt_end - boot32_idt - 1 + .long 0 +SYM_DATA_END(boot32_idt_desc) + .balign 8 +SYM_DATA_START(boot32_idt) + .rept 32 + .quad 0 + .endr +SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end) +#endif + #ifdef CONFIG_EFI_STUB SYM_DATA(image_offset, .long 0) #endif @@ -786,6 +823,137 @@ SYM_DATA_START_LOCAL(loaded_image_proto) SYM_DATA_END(loaded_image_proto) #endif +#ifdef CONFIG_AMD_MEM_ENCRYPT + __HEAD + .code32 +/* + * Write an IDT entry into boot32_idt + * + * Parameters: + * + * %eax: Handler address + * %edx: Vector number + * + * Physical offset is expected in %ebp + */ +SYM_FUNC_START(startup32_set_idt_entry) + push %ebx + push %ecx + + /* IDT entry address to %ebx */ + leal rva(boot32_idt)(%ebp), %ebx + shl $3, %edx + addl %edx, %ebx + + /* Build IDT entry, lower 4 bytes */ + movl %eax, %edx + andl $0x0000ffff, %edx # Target code segment offset [15:0] + movl $__KERNEL32_CS, %ecx # Target code segment selector + shl $16, %ecx + orl %ecx, %edx + + /* Store lower 4 bytes to IDT */ + movl %edx, (%ebx) + + /* Build IDT entry, upper 4 bytes */ + movl %eax, %edx + andl $0xffff0000, %edx # Target code segment offset [31:16] + orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate + + /* Store upper 4 bytes to IDT */ + movl %edx, 4(%ebx) + + pop %ecx + pop %ebx + ret +SYM_FUNC_END(startup32_set_idt_entry) +#endif + +SYM_FUNC_START(startup32_load_idt) +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* #VC handler */ + leal rva(startup32_vc_handler)(%ebp), %eax + movl $X86_TRAP_VC, %edx + call startup32_set_idt_entry + + /* Load IDT */ + leal rva(boot32_idt)(%ebp), %eax + movl %eax, rva(boot32_idt_desc+2)(%ebp) + lidt rva(boot32_idt_desc)(%ebp) +#endif + ret +SYM_FUNC_END(startup32_load_idt) + +/* + * Check for the correct C-bit position when the startup_32 boot-path is used. + * + * The check makes use of the fact that all memory is encrypted when paging is + * disabled. The function creates 64 bits of random data using the RDRAND + * instruction. RDRAND is mandatory for SEV guests, so always available. If the + * hypervisor violates that the kernel will crash right here. + * + * The 64 bits of random data are stored to a memory location and at the same + * time kept in the %eax and %ebx registers. Since encryption is always active + * when paging is off the random data will be stored encrypted in main memory. + * + * Then paging is enabled. When the C-bit position is correct all memory is + * still mapped encrypted and comparing the register values with memory will + * succeed. An incorrect C-bit position will map all memory unencrypted, so that + * the compare will use the encrypted random data and fail. + */ +SYM_FUNC_START(startup32_check_sev_cbit) +#ifdef CONFIG_AMD_MEM_ENCRYPT + pushl %eax + pushl %ebx + pushl %ecx + pushl %edx + + /* Check for non-zero sev_status */ + movl rva(sev_status)(%ebp), %eax + testl %eax, %eax + jz 4f + + /* + * Get two 32-bit random values - Don't bail out if RDRAND fails + * because it is better to prevent forward progress if no random value + * can be gathered. + */ +1: rdrand %eax + jnc 1b +2: rdrand %ebx + jnc 2b + + /* Store to memory and keep it in the registers */ + movl %eax, rva(sev_check_data)(%ebp) + movl %ebx, rva(sev_check_data+4)(%ebp) + + /* Enable paging to see if encryption is active */ + movl %cr0, %edx /* Backup %cr0 in %edx */ + movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */ + movl %ecx, %cr0 + + cmpl %eax, rva(sev_check_data)(%ebp) + jne 3f + cmpl %ebx, rva(sev_check_data+4)(%ebp) + jne 3f + + movl %edx, %cr0 /* Restore previous %cr0 */ + + jmp 4f + +3: /* Check failed - hlt the machine */ + hlt + jmp 3b + +4: + popl %edx + popl %ecx + popl %ebx + popl %eax +#endif + ret +SYM_FUNC_END(startup32_check_sev_cbit) + /* * Stack and heap for uncompression */ diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 804a502ee0d2..9b93567d663a 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -52,3 +52,17 @@ void load_stage2_idt(void) load_boot_idt(&boot_idt_desc); } + +void cleanup_exception_handling(void) +{ + /* + * Flush GHCB from cache and map it encrypted again when running as + * SEV-ES guest. + */ + sev_es_shutdown_ghcb(); + + /* Set a null-idt, disabling #PF and #VC handling */ + boot_idt_desc.size = 0; + boot_idt_desc.address = 0; + load_boot_idt(&boot_idt_desc); +} diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b92fffbe761f..e36690778497 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -639,9 +639,9 @@ static bool process_mem_region(struct mem_vector *region, if (slot_area_index == MAX_SLOT_AREA) { debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n"); - return 1; + return true; } - return 0; + return false; } #if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index aa561795efd1..c1e81a848b2a 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -23,12 +23,6 @@ SYM_FUNC_START(get_sev_encryption_bit) push %ecx push %edx - /* Check if running under a hypervisor */ - movl $1, %eax - cpuid - bt $31, %ecx /* Check the hypervisor bit */ - jnc .Lno_sev - movl $0x80000000, %eax /* CPUID to check the highest leaf */ cpuid cmpl $0x8000001f, %eax /* See if 0x8000001f is available */ @@ -67,10 +61,132 @@ SYM_FUNC_START(get_sev_encryption_bit) ret SYM_FUNC_END(get_sev_encryption_bit) +/** + * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using + * the GHCB MSR protocol + * + * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX) + * @%edx: CPUID Function + * + * Returns 0 in %eax on success, non-zero on failure + * %edx returns CPUID value on success + */ +SYM_CODE_START_LOCAL(sev_es_req_cpuid) + shll $30, %eax + orl $0x00000004, %eax + movl $MSR_AMD64_SEV_ES_GHCB, %ecx + wrmsr + rep; vmmcall # VMGEXIT + rdmsr + + /* Check response */ + movl %eax, %ecx + andl $0x3ffff000, %ecx # Bits [12-29] MBZ + jnz 2f + + /* Check return code */ + andl $0xfff, %eax + cmpl $5, %eax + jne 2f + + /* All good - return success */ + xorl %eax, %eax +1: + ret +2: + movl $-1, %eax + jmp 1b +SYM_CODE_END(sev_es_req_cpuid) + +SYM_CODE_START(startup32_vc_handler) + pushl %eax + pushl %ebx + pushl %ecx + pushl %edx + + /* Keep CPUID function in %ebx */ + movl %eax, %ebx + + /* Check if error-code == SVM_EXIT_CPUID */ + cmpl $0x72, 16(%esp) + jne .Lfail + + movl $0, %eax # Request CPUID[fn].EAX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 12(%esp) # Store result + + movl $1, %eax # Request CPUID[fn].EBX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 8(%esp) # Store result + + movl $2, %eax # Request CPUID[fn].ECX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 4(%esp) # Store result + + movl $3, %eax # Request CPUID[fn].EDX + movl %ebx, %edx # CPUID fn + call sev_es_req_cpuid # Call helper + testl %eax, %eax # Check return code + jnz .Lfail + movl %edx, 0(%esp) # Store result + + /* + * Sanity check CPUID results from the Hypervisor. See comment in + * do_vc_no_ghcb() for more details on why this is necessary. + */ + + /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */ + cmpl $0x80000000, %ebx + jne .Lcheck_sev + cmpl $0x8000001f, 12(%esp) + jb .Lfail + jmp .Ldone + +.Lcheck_sev: + /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */ + cmpl $0x8000001f, %ebx + jne .Ldone + btl $1, 12(%esp) + jnc .Lfail + +.Ldone: + popl %edx + popl %ecx + popl %ebx + popl %eax + + /* Remove error code */ + addl $4, %esp + + /* Jump over CPUID instruction */ + addl $2, (%esp) + + iret +.Lfail: + /* Send terminate request to Hypervisor */ + movl $0x100, %eax + xorl %edx, %edx + movl $MSR_AMD64_SEV_ES_GHCB, %ecx + wrmsr + rep; vmmcall + + /* If request fails, go to hlt loop */ + hlt + jmp .Lfail +SYM_CODE_END(startup32_vc_handler) + .code64 #include "../../kernel/sev_verify_cbit.S" - SYM_FUNC_START(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 267e7f93050e..dde042f64cca 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -430,8 +430,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, error("Destination address too large"); #endif #ifndef CONFIG_RELOCATABLE - if ((unsigned long)output != LOAD_PHYSICAL_ADDR) - error("Destination address does not match LOAD_PHYSICAL_ADDR"); if (virt_addr != LOAD_PHYSICAL_ADDR) error("Destination virtual address changed when not relocatable"); #endif @@ -443,11 +441,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, handle_relocations(output, output_len, virt_addr); debug_putstr("done.\nBooting the kernel.\n"); - /* - * Flush GHCB from cache and map it encrypted again when running as - * SEV-ES guest. - */ - sev_es_shutdown_ghcb(); + /* Disable exception handling before booting the kernel */ + cleanup_exception_handling(); return output; } diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 901ea5ebec22..e5612f035498 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -155,6 +155,12 @@ extern pteval_t __default_kernel_pte_mask; extern gate_desc boot_idt[BOOT_IDT_ENTRIES]; extern struct desc_ptr boot_idt_desc; +#ifdef CONFIG_X86_64 +void cleanup_exception_handling(void); +#else +static inline void cleanup_exception_handling(void) { } +#endif + /* IDT Entry Points */ void boot_page_fault(void); void boot_stage1_vc(void); diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c index 27826c265aab..82041bd380e5 100644 --- a/arch/x86/boot/compressed/sev-es.c +++ b/arch/x86/boot/compressed/sev-es.c @@ -78,16 +78,15 @@ static inline void sev_es_wr_ghcb_msr(u64 val) static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) { char buffer[MAX_INSN_SIZE]; - enum es_result ret; + int ret; memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); - insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1); - insn_get_length(&ctxt->insn); + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; - ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; - - return ret; + return ES_OK; } static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, @@ -200,14 +199,8 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) } finish: - if (result == ES_OK) { + if (result == ES_OK) vc_finish_insn(&ctxt); - } else if (result != ES_RETRY) { - /* - * For now, just halt the machine. That makes debugging easier, - * later we just call sev_es_terminate() here. - */ - while (true) - asm volatile("hlt\n"); - } + else if (result != ES_RETRY) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); } diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index b28e36b7c96b..d0959e7b809f 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,8 +2,6 @@ # # x86 crypto algorithms -OBJECT_FILES_NON_STANDARD := y - obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 2cf8e94d986a..98e3552b6e03 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -212,10 +212,6 @@ HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu #define arg4 %rcx #define arg5 %r8 #define arg6 %r9 -#define arg7 STACK_OFFSET+8*1(%r14) -#define arg8 STACK_OFFSET+8*2(%r14) -#define arg9 STACK_OFFSET+8*3(%r14) -#define arg10 STACK_OFFSET+8*4(%r14) #define keysize 2*15*16(arg1) i = 0 @@ -237,9 +233,6 @@ define_reg j %j .noaltmacro .endm -# need to push 4 registers into stack to maintain -STACK_OFFSET = 8*4 - TMP1 = 16*0 # Temporary storage for AAD TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) TMP3 = 16*2 # Temporary storage for AES State 3 @@ -256,25 +249,22 @@ VARIABLE_OFFSET = 16*8 ################################ .macro FUNC_SAVE - #the number of pushes must equal STACK_OFFSET push %r12 push %r13 - push %r14 push %r15 - mov %rsp, %r14 - - + push %rbp + mov %rsp, %rbp sub $VARIABLE_OFFSET, %rsp and $~63, %rsp # align rsp to 64 bytes .endm .macro FUNC_RESTORE - mov %r14, %rsp + mov %rbp, %rsp + pop %rbp pop %r15 - pop %r14 pop %r13 pop %r12 .endm @@ -294,7 +284,7 @@ VARIABLE_OFFSET = 16*8 # combined for GCM encrypt and decrypt functions # clobbering all xmm registers -# clobbering r10, r11, r12, r13, r14, r15 +# clobbering r10, r11, r12, r13, r15, rax .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP vmovdqu AadHash(arg2), %xmm8 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey @@ -996,7 +986,7 @@ _partial_block_done_\@: ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg3, arg4, r14 are used as a pointer only, not modified +## arg1, arg2, arg3, arg4 are used as pointers only, not modified .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC i = (8-\num_initial_blocks) @@ -1231,7 +1221,7 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg3, arg4 are used as pointers only, not modified +# arg1, arg2, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC @@ -1944,7 +1934,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg3, arg4, r14 are used as a pointer only, not modified +## arg1, arg2, arg3, arg4 are used as pointers only, not modified .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) @@ -2186,7 +2176,7 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg3, arg4 are used as pointers only, not modified +# arg1, arg2, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 782e9712a1ec..706f70829a07 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -990,6 +990,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way) * %rdx: src (32 blocks) */ FRAME_BEGIN + subq $(16 * 32), %rsp; vzeroupper; @@ -1002,7 +1003,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way) %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, %rdx, (key_table)(CTX, %r8, 8)); - movq %rsp, %r10; cmpq %rsi, %rdx; je .Lcbc_dec_use_stack; @@ -1015,7 +1015,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way) * dst still in-use (because dst == src), so use stack for temporary * storage. */ - subq $(16 * 32), %rsp; movq %rsp, %rax; .Lcbc_dec_continue: @@ -1025,7 +1024,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way) vpxor %ymm7, %ymm7, %ymm7; vinserti128 $1, (%rdx), %ymm7, %ymm7; vpxor (%rax), %ymm7, %ymm7; - movq %r10, %rsp; vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; @@ -1047,6 +1045,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way) vzeroupper; + addq $(16 * 32), %rsp; FRAME_END ret; SYM_FUNC_END(camellia_cbc_dec_32way) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 7c4c7b2fbf05..98cf3b4e4c9f 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -24,7 +24,7 @@ /* * Copyright 2012 Xyratex Technology Limited * - * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation. + * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation. */ #include <linux/init.h> #include <linux/module.h> diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 884dc767b051..ac1f303eed0f 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -53,7 +53,7 @@ .endm .macro JMPTBL_ENTRY i -.word crc_\i - crc_array +.quad crc_\i .endm .macro JNC_LESS_THAN j @@ -168,10 +168,7 @@ continue_block: xor crc2, crc2 ## branch into array - lea jump_table(%rip), %bufp - movzwq (%bufp, %rax, 2), len - lea crc_array(%rip), %bufp - lea (%bufp, len, 1), %bufp + mov jump_table(,%rax,8), %bufp JMP_NOSPEC bufp ################################################################ diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 5af8021b98ce..6706b6cb1d0f 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -114,11 +114,11 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2) ); } -/* Computes the field substraction of two field elements */ +/* Computes the field subtraction of two field elements */ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) { asm volatile( - /* Compute the raw substraction of f1-f2 */ + /* Compute the raw subtraction of f1-f2 */ " movq 0(%1), %%r8;" " subq 0(%2), %%r8;" " movq 8(%1), %%r9;" @@ -135,7 +135,7 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2) " mov $38, %%rcx;" " cmovc %%rcx, %%rax;" - /* Step 2: Substract carry*38 from the original difference */ + /* Step 2: Subtract carry*38 from the original difference */ " sub %%rax, %%r8;" " sbb $0, %%r9;" " sbb $0, %%r10;" diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 646da46e8d10..1dfb8af48a3c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -16,7 +16,7 @@ #include <asm/simd.h> asmlinkage void poly1305_init_x86_64(void *ctx, - const u8 key[POLY1305_KEY_SIZE]); + const u8 key[POLY1305_BLOCK_SIZE]); asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, const size_t len, const u32 padbit); asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], @@ -81,7 +81,7 @@ static void convert_to_base2_64(void *ctx) state->is_base2_26 = 0; } -static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE]) +static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) { poly1305_init_x86_64(ctx, key); } @@ -129,7 +129,7 @@ static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], poly1305_emit_avx(ctx, mac, nonce); } -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key) +void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) { poly1305_simd_init(&dctx->h, key); dctx->s[0] = get_unaligned_le32(&key[16]); diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1e594d60afa5..5eed620f4676 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -645,9 +645,9 @@ _loop3: RESERVE_STACK = (W_SIZE*4 + 8+24) /* Align stack */ - mov %rsp, %rbx + push %rbp + mov %rsp, %rbp and $~(0x20-1), %rsp - push %rbx sub $RESERVE_STACK, %rsp avx2_zeroupper @@ -665,8 +665,8 @@ _loop3: avx2_zeroupper - add $RESERVE_STACK, %rsp - pop %rsp + mov %rbp, %rsp + pop %rbp pop %r15 pop %r14 diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S index 11efe3a45a1f..5d8415f482bd 100644 --- a/arch/x86/crypto/sha1_ni_asm.S +++ b/arch/x86/crypto/sha1_ni_asm.S @@ -59,8 +59,6 @@ #define DATA_PTR %rsi /* 2nd arg */ #define NUM_BLKS %rdx /* 3rd arg */ -#define RSPSAVE %rax - /* gcc conversion */ #define FRAME_SIZE 32 /* space for 2x16 bytes */ @@ -96,7 +94,8 @@ .text .align 32 SYM_FUNC_START(sha1_ni_transform) - mov %rsp, RSPSAVE + push %rbp + mov %rsp, %rbp sub $FRAME_SIZE, %rsp and $~0xF, %rsp @@ -288,7 +287,8 @@ SYM_FUNC_START(sha1_ni_transform) pextrd $3, E0, 1*16(DIGEST_PTR) .Ldone_hash: - mov RSPSAVE, %rsp + mov %rbp, %rsp + pop %rbp ret SYM_FUNC_END(sha1_ni_transform) diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 11ff60c29c8b..4087f7432a7e 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -117,15 +117,13 @@ _XMM_SAVE_SIZE = 0 _INP_END_SIZE = 8 _INP_SIZE = 8 _CTX_SIZE = 8 -_RSP_SIZE = 8 _XFER = 0 _XMM_SAVE = _XFER + _XFER_SIZE _INP_END = _XMM_SAVE + _XMM_SAVE_SIZE _INP = _INP_END + _INP_END_SIZE _CTX = _INP + _INP_SIZE -_RSP = _CTX + _CTX_SIZE -STACK_SIZE = _RSP + _RSP_SIZE +STACK_SIZE = _CTX + _CTX_SIZE # rotate_Xs # Rotate values of symbols X0...X3 @@ -533,11 +531,11 @@ SYM_FUNC_START(sha256_transform_rorx) pushq %r14 pushq %r15 - mov %rsp, %rax + push %rbp + mov %rsp, %rbp + subq $STACK_SIZE, %rsp and $-32, %rsp # align rsp to 32 byte boundary - mov %rax, _RSP(%rsp) - shl $6, NUM_BLKS # convert to bytes jz done_hash @@ -704,7 +702,8 @@ only_one_block: done_hash: - mov _RSP(%rsp), %rsp + mov %rbp, %rsp + pop %rbp popq %r15 popq %r14 diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 684d58c8bc4f..3d8f0fd4eea8 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -76,14 +76,10 @@ tmp0 = %rax W_SIZE = 80*8 # W[t] + K[t] | W[t+1] + K[t+1] WK_SIZE = 2*8 -RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 5*8 frame_W = 0 frame_WK = frame_W + W_SIZE -frame_RSPSAVE = frame_WK + WK_SIZE -frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE -frame_size = frame_GPRSAVE + GPRSAVE_SIZE +frame_size = frame_WK + WK_SIZE # Useful QWORD "arrays" for simpler memory references # MSG, DIGEST, K_t, W_t are arrays @@ -281,18 +277,18 @@ SYM_FUNC_START(sha512_transform_avx) test msglen, msglen je nowork + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + # Allocate Stack Space - mov %rsp, %rax + push %rbp + mov %rsp, %rbp sub $frame_size, %rsp and $~(0x20 - 1), %rsp - mov %rax, frame_RSPSAVE(%rsp) - - # Save GPRs - mov %rbx, frame_GPRSAVE(%rsp) - mov %r12, frame_GPRSAVE +8*1(%rsp) - mov %r13, frame_GPRSAVE +8*2(%rsp) - mov %r14, frame_GPRSAVE +8*3(%rsp) - mov %r15, frame_GPRSAVE +8*4(%rsp) updateblock: @@ -353,15 +349,16 @@ updateblock: dec msglen jnz updateblock - # Restore GPRs - mov frame_GPRSAVE(%rsp), %rbx - mov frame_GPRSAVE +8*1(%rsp), %r12 - mov frame_GPRSAVE +8*2(%rsp), %r13 - mov frame_GPRSAVE +8*3(%rsp), %r14 - mov frame_GPRSAVE +8*4(%rsp), %r15 - # Restore Stack Pointer - mov frame_RSPSAVE(%rsp), %rsp + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx nowork: ret diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 3a44bdcfd583..072cb0f0deae 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -102,17 +102,13 @@ SRND_SIZE = 1*8 INP_SIZE = 1*8 INPEND_SIZE = 1*8 CTX_SIZE = 1*8 -RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 5*8 frame_XFER = 0 frame_SRND = frame_XFER + XFER_SIZE frame_INP = frame_SRND + SRND_SIZE frame_INPEND = frame_INP + INP_SIZE frame_CTX = frame_INPEND + INPEND_SIZE -frame_RSPSAVE = frame_CTX + CTX_SIZE -frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE -frame_size = frame_GPRSAVE + GPRSAVE_SIZE +frame_size = frame_CTX + CTX_SIZE ## assume buffers not aligned #define VMOVDQ vmovdqu @@ -570,18 +566,18 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE # "blocks" is the message length in SHA512 blocks ######################################################################## SYM_FUNC_START(sha512_transform_rorx) + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + # Allocate Stack Space - mov %rsp, %rax + push %rbp + mov %rsp, %rbp sub $frame_size, %rsp and $~(0x20 - 1), %rsp - mov %rax, frame_RSPSAVE(%rsp) - - # Save GPRs - mov %rbx, 8*0+frame_GPRSAVE(%rsp) - mov %r12, 8*1+frame_GPRSAVE(%rsp) - mov %r13, 8*2+frame_GPRSAVE(%rsp) - mov %r14, 8*3+frame_GPRSAVE(%rsp) - mov %r15, 8*4+frame_GPRSAVE(%rsp) shl $7, NUM_BLKS # convert to bytes jz done_hash @@ -672,15 +668,17 @@ loop2: done_hash: -# Restore GPRs - mov 8*0+frame_GPRSAVE(%rsp), %rbx - mov 8*1+frame_GPRSAVE(%rsp), %r12 - mov 8*2+frame_GPRSAVE(%rsp), %r13 - mov 8*3+frame_GPRSAVE(%rsp), %r14 - mov 8*4+frame_GPRSAVE(%rsp), %r15 - # Restore Stack Pointer - mov frame_RSPSAVE(%rsp), %rsp + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + ret SYM_FUNC_END(sha512_transform_rorx) diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index 50812af0b083..bd51c9070bed 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -74,14 +74,10 @@ tmp0 = %rax W_SIZE = 80*8 WK_SIZE = 2*8 -RSPSAVE_SIZE = 1*8 -GPRSAVE_SIZE = 5*8 frame_W = 0 frame_WK = frame_W + W_SIZE -frame_RSPSAVE = frame_WK + WK_SIZE -frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE -frame_size = frame_GPRSAVE + GPRSAVE_SIZE +frame_size = frame_WK + WK_SIZE # Useful QWORD "arrays" for simpler memory references # MSG, DIGEST, K_t, W_t are arrays @@ -283,18 +279,18 @@ SYM_FUNC_START(sha512_transform_ssse3) test msglen, msglen je nowork + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + # Allocate Stack Space - mov %rsp, %rax + push %rbp + mov %rsp, %rbp sub $frame_size, %rsp and $~(0x20 - 1), %rsp - mov %rax, frame_RSPSAVE(%rsp) - - # Save GPRs - mov %rbx, frame_GPRSAVE(%rsp) - mov %r12, frame_GPRSAVE +8*1(%rsp) - mov %r13, frame_GPRSAVE +8*2(%rsp) - mov %r14, frame_GPRSAVE +8*3(%rsp) - mov %r15, frame_GPRSAVE +8*4(%rsp) updateblock: @@ -355,15 +351,16 @@ updateblock: dec msglen jnz updateblock - # Restore GPRs - mov frame_GPRSAVE(%rsp), %rbx - mov frame_GPRSAVE +8*1(%rsp), %r12 - mov frame_GPRSAVE +8*2(%rsp), %r13 - mov frame_GPRSAVE +8*3(%rsp), %r14 - mov frame_GPRSAVE +8*4(%rsp), %r15 - # Restore Stack Pointer - mov frame_RSPSAVE(%rsp), %rsp + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx nowork: ret diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index fc23552afe37..bca4cea757ce 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -88,7 +88,7 @@ /* * Combined G1 & G2 function. Reordered with help of rotates to have moves - * at begining. + * at beginning. */ #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ /* G1,1 && G2,1 */ \ diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 03725696397c..3507cf2064f1 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -117,7 +117,7 @@ static bool is_blacklisted_cpu(void) * storing blocks in 64bit registers to allow three blocks to * be processed parallel. Parallel operation then allows gaining * more performance than was trade off, on out-of-order CPUs. - * However Atom does not benefit from this parallellism and + * However Atom does not benefit from this parallelism and * should be blacklisted. */ return true; diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 4efd39aacb9f..7b2542b13ebd 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -38,6 +38,7 @@ #ifdef CONFIG_X86_64 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { + add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); @@ -83,6 +84,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { unsigned int nr = syscall_32_enter(regs); + add_random_kstack_offset(); /* * Subtlety here: if ptrace pokes something larger than 2^32-1 into * orig_ax, the unsigned int return value truncates it. This may @@ -102,6 +104,7 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) unsigned int nr = syscall_32_enter(regs); int res; + add_random_kstack_offset(); /* * This cannot use syscall_enter_from_user_mode() as it has to * fetch EBP before invoking any of the syscall entry work diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index df8c017e6161..ccb9d32768f3 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -20,7 +20,7 @@ * 1C(%esp) - %ds * 20(%esp) - %es * 24(%esp) - %fs - * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 28(%esp) - unused -- was %gs on old stackprotector kernels * 2C(%esp) - orig_eax * 30(%esp) - %eip * 34(%esp) - %cs @@ -40,7 +40,7 @@ #include <asm/processor-flags.h> #include <asm/irq_vectors.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/asm.h> #include <asm/smap.h> #include <asm/frame.h> @@ -53,83 +53,6 @@ #define PTI_SWITCH_MASK (1 << PAGE_SHIFT) -/* - * User gs save/restore - * - * %gs is used for userland TLS and kernel only uses it for stack - * canary which is required to be at %gs:20 by gcc. Read the comment - * at the top of stackprotector.h for more info. - * - * Local labels 98 and 99 are used. - */ -#ifdef CONFIG_X86_32_LAZY_GS - - /* unfortunately push/pop can't be no-op */ -.macro PUSH_GS - pushl $0 -.endm -.macro POP_GS pop=0 - addl $(4 + \pop), %esp -.endm -.macro POP_GS_EX -.endm - - /* all the rest are no-op */ -.macro PTGS_TO_GS -.endm -.macro PTGS_TO_GS_EX -.endm -.macro GS_TO_REG reg -.endm -.macro REG_TO_PTGS reg -.endm -.macro SET_KERNEL_GS reg -.endm - -#else /* CONFIG_X86_32_LAZY_GS */ - -.macro PUSH_GS - pushl %gs -.endm - -.macro POP_GS pop=0 -98: popl %gs - .if \pop <> 0 - add $\pop, %esp - .endif -.endm -.macro POP_GS_EX -.pushsection .fixup, "ax" -99: movl $0, (%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b, 99b) -.endm - -.macro PTGS_TO_GS -98: mov PT_GS(%esp), %gs -.endm -.macro PTGS_TO_GS_EX -.pushsection .fixup, "ax" -99: movl $0, PT_GS(%esp) - jmp 98b -.popsection - _ASM_EXTABLE(98b, 99b) -.endm - -.macro GS_TO_REG reg - movl %gs, \reg -.endm -.macro REG_TO_PTGS reg - movl \reg, PT_GS(%esp) -.endm -.macro SET_KERNEL_GS reg - movl $(__KERNEL_STACK_CANARY), \reg - movl \reg, %gs -.endm - -#endif /* CONFIG_X86_32_LAZY_GS */ - /* Unconditionally switch to user cr3 */ .macro SWITCH_TO_USER_CR3 scratch_reg:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI @@ -209,7 +132,7 @@ * * Lets build a 5 entry IRET frame after that, such that struct pt_regs * is complete and in particular regs->sp is correct. This gives us - * the original 6 enties as gap: + * the original 6 entries as gap: * * 14*4(%esp) - <previous context> * 13*4(%esp) - gap / flags @@ -282,7 +205,7 @@ .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0 cld .if \skip_gs == 0 - PUSH_GS + pushl $0 .endif pushl %fs @@ -307,9 +230,6 @@ movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es -.if \skip_gs == 0 - SET_KERNEL_GS %edx -.endif /* Switch to kernel stack if necessary */ .if \switch_stacks > 0 SWITCH_TO_KERNEL_STACK @@ -348,7 +268,7 @@ 1: popl %ds 2: popl %es 3: popl %fs - POP_GS \pop + addl $(4 + \pop), %esp /* pop the unused "gs" slot */ IRET_FRAME .pushsection .fixup, "ax" 4: movl $0, (%esp) @@ -361,7 +281,6 @@ _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 5b) _ASM_EXTABLE(3b, 6b) - POP_GS_EX .endm .macro RESTORE_ALL_NMI cr3_reg:req pop=0 @@ -430,7 +349,7 @@ * will soon execute iret and the tracer was already set to * the irqstate after the IRET: */ - DISABLE_INTERRUPTS(CLBR_ANY) + cli lss (%esp), %esp /* switch to espfix segment */ .Lend_\@: #endif /* CONFIG_X86_ESPFIX32 */ @@ -779,7 +698,7 @@ SYM_CODE_START(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movl TASK_stack_canary(%edx), %ebx - movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset + movl %ebx, PER_CPU_VAR(__stack_chk_guard) #endif #ifdef CONFIG_RETPOLINE @@ -976,7 +895,6 @@ SYM_FUNC_START(entry_SYSENTER_32) movl PT_EIP(%esp), %edx /* pt_regs->ip */ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ 1: mov PT_FS(%esp), %fs - PTGS_TO_GS popl %ebx /* pt_regs->bx */ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ @@ -1012,7 +930,6 @@ SYM_FUNC_START(entry_SYSENTER_32) jmp 1b .popsection _ASM_EXTABLE(1b, 2b) - PTGS_TO_GS_EX .Lsysenter_fix_flags: pushl $X86_EFLAGS_FIXED @@ -1077,7 +994,7 @@ restore_all_switch_stack: * when returning from IPI handler and when returning from * scheduler to user-space. */ - INTERRUPT_RETURN + iret .section .fixup, "ax" SYM_CODE_START(asm_iret_error) @@ -1154,11 +1071,7 @@ SYM_CODE_START_LOCAL_NOALIGN(handle_exception) SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER - /* fixup %gs */ - GS_TO_REG %ecx movl PT_GS(%esp), %edi # get the function address - REG_TO_PTGS %ecx - SET_KERNEL_GS %ecx /* fixup orig %eax */ movl PT_ORIG_EAX(%esp), %edx # get the error code diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 400908dff42e..a16a5294d55f 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -305,7 +305,7 @@ SYM_CODE_END(ret_from_fork) .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY pushq %rax - SAVE_FLAGS(CLBR_RAX) + SAVE_FLAGS testl $X86_EFLAGS_IF, %eax jz .Lokay_\@ ud2 @@ -511,7 +511,7 @@ SYM_CODE_START(\asmsym) /* * No need to switch back to the IST stack. The current stack is either * identical to the stack in the IRET frame or the VC fall-back stack, - * so it is definitly mapped even with PTI enabled. + * so it is definitely mapped even with PTI enabled. */ jmp paranoid_exit diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 2d0f3d8bcc25..edfe9780f6d1 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -218,7 +218,7 @@ int main(int argc, char **argv) /* * Figure out the struct name. If we're writing to a .so file, - * generate raw output insted. + * generate raw output instead. */ name = strdup(argv[3]); namelen = strlen(name); diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 1c7cfac7e64a..5264daa8859f 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -35,7 +35,7 @@ static void BITSFUNC(extract)(const unsigned char *data, size_t data_len, if (offset + len > data_len) fail("section to extract overruns input data"); - fprintf(outfile, "static const unsigned char %s[%lu] = {", name, len); + fprintf(outfile, "static const unsigned char %s[%zu] = {", name, len); BITSFUNC(copy)(outfile, data + offset, len); fprintf(outfile, "\n};\n\n"); } diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index de1fff7188aa..6ddd7a937b3e 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -6,7 +6,7 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> .text .globl __kernel_vsyscall @@ -29,7 +29,7 @@ __kernel_vsyscall: * anyone with an AMD CPU, for example). Nonetheless, we try to keep * it working approximately as well as it ever worked. * - * This link may eludicate some of the history: + * This link may elucidate some of the history: * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7 * personally, I find it hard to understand what's going on there. * diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 825e829ffff1..235a5794296a 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -358,7 +358,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) mmap_write_lock(mm); /* * Check if we have already mapped vdso blob - fail to prevent - * abusing from userspace install_speciall_mapping, which may + * abusing from userspace install_special_mapping, which may * not do accounting and rlimit right. * We could search vma near context.vdso, but it's a slowpath, * so let's explicitly check all VMAs to be completely sure. diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S index 86a0e94f68df..99dafac992e2 100644 --- a/arch/x86/entry/vdso/vsgx.S +++ b/arch/x86/entry/vdso/vsgx.S @@ -137,7 +137,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave) /* * If the return from callback is zero or negative, return immediately, - * else re-execute ENCLU with the postive return value interpreted as + * else re-execute ENCLU with the positive return value interpreted as * the requested ENCLU function. */ cmp $0, %eax diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 2c1791c4a518..9687a8aef01c 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -623,7 +623,7 @@ static void amd_pmu_disable_all(void) /* * Check each counter for overflow and wait for it to be reset by the * NMI if it has overflowed. This relies on the fact that all active - * counters are always enabled when this function is caled and + * counters are always enabled when this function is called and * ARCH_PERFMON_EVENTSEL_INT is always set. */ for (idx = 0; idx < x86_pmu.num_counters; idx++) { diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index be50ef8572cc..6a98a7651621 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -81,12 +81,12 @@ static struct attribute_group amd_iommu_events_group = { }; struct amd_iommu_event_desc { - struct kobj_attribute attr; + struct device_attribute attr; const char *event; }; -static ssize_t _iommu_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t _iommu_event_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct amd_iommu_event_desc *event = container_of(attr, struct amd_iommu_event_desc, attr); diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h index 0e5c036fd7be..e6493a67f1c6 100644 --- a/arch/x86/events/amd/iommu.h +++ b/arch/x86/events/amd/iommu.h @@ -17,7 +17,7 @@ #define IOMMU_PC_DEVID_MATCH_REG 0x20 #define IOMMU_PC_COUNTER_REPORT_REG 0x28 -/* maximun specified bank/counters */ +/* maximum specified bank/counters */ #define PC_MAX_SPEC_BNKS 64 #define PC_MAX_SPEC_CNTRS 16 diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 7f014d450bc2..582c0ffb5e98 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -275,14 +275,14 @@ static struct attribute_group amd_uncore_attr_group = { }; #define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ +static ssize_t __uncore_##_var##_show(struct device *dev, \ + struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ -static struct kobj_attribute format_attr_##_var = \ +static struct device_attribute format_attr_##_var = \ __ATTR(_name, 0444, __uncore_##_var##_show, NULL) DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 18df17129695..8e509325c2c3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -45,13 +45,16 @@ #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; +static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, + .pmu = &pmu, }; DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); +DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); /* * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined @@ -151,15 +154,16 @@ again: */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { + struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); struct hw_perf_event_extra *reg; struct extra_reg *er; reg = &event->hw.extra_reg; - if (!x86_pmu.extra_regs) + if (!extra_regs) return 0; - for (er = x86_pmu.extra_regs; er->msr; er++) { + for (er = extra_regs; er->msr; er++) { if (er->event != (config & er->config_mask)) continue; if (event->attr.config1 & ~er->valid_mask) @@ -182,16 +186,29 @@ static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC +static inline int get_possible_num_counters(void) +{ + int i, num_counters = x86_pmu.num_counters; + + if (!is_hybrid()) + return num_counters; + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) + num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters); + + return num_counters; +} + static bool reserve_pmc_hardware(void) { - int i; + int i, num_counters = get_possible_num_counters(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -202,7 +219,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu_config_addr(i)); - i = x86_pmu.num_counters; + i = num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -213,9 +230,9 @@ perfctr_fail: static void release_pmc_hardware(void) { - int i; + int i, num_counters = get_possible_num_counters(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } @@ -228,7 +245,7 @@ static void release_pmc_hardware(void) {} #endif -static bool check_hw_exists(void) +bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; @@ -239,7 +256,7 @@ static bool check_hw_exists(void) * Check to see if the BIOS enabled any of the counters, if so * complain and bail. */ - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) @@ -253,15 +270,15 @@ static bool check_hw_exists(void) } } - if (x86_pmu.num_counters_fixed) { + if (num_counters_fixed) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; - for (i = 0; i < x86_pmu.num_counters_fixed; i++) { - if (fixed_counter_disabled(i)) + for (i = 0; i < num_counters_fixed; i++) { + if (fixed_counter_disabled(i, pmu)) continue; - if (val & (0x03 << i*4)) { + if (val & (0x03ULL << i*4)) { bios_fail = 1; val_fail = val; reg_fail = reg; @@ -360,8 +377,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return -EINVAL; cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - + val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result]; if (val == 0) return -ENOENT; @@ -369,7 +385,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return -EINVAL; hwc->config |= val; - attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result]; return x86_pmu_extra_regs(val, event); } @@ -462,7 +478,7 @@ int x86_setup_perfctr(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } - if (attr->type == PERF_TYPE_RAW) + if (attr->type == event->pmu->type) return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) @@ -597,7 +613,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (!event->attr.exclude_kernel) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; - if (event->attr.type == PERF_TYPE_RAW) + if (event->attr.type == event->pmu->type) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { @@ -724,16 +740,33 @@ void x86_pmu_enable_all(int added) } } -static struct pmu pmu; - static inline int is_x86_event(struct perf_event *event) { - return event->pmu == &pmu; + int i; + + if (!is_hybrid()) + return event->pmu == &pmu; + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) + return true; + } + + return false; } -struct pmu *x86_get_pmu(void) +struct pmu *x86_get_pmu(unsigned int cpu) { - return &pmu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + /* + * All CPUs of the hybrid type have been offline. + * The x86_get_pmu() should not be invoked. + */ + if (WARN_ON_ONCE(!cpuc->pmu)) + return &pmu; + + return cpuc->pmu; } /* * Event scheduler state: @@ -765,7 +798,7 @@ struct perf_sched { }; /* - * Initialize interator that runs through all events and counters. + * Initialize iterator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) @@ -936,6 +969,7 @@ EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { + int num_counters = hybrid(cpuc->pmu, num_counters); struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; @@ -1011,7 +1045,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) { - int gpmax = x86_pmu.num_counters; + int gpmax = num_counters; /* * Do not allow scheduling of more than half the available @@ -1032,7 +1066,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * the extra Merge events needed by large increment events. */ if (x86_pmu.flags & PMU_FL_PAIR) { - gpmax = x86_pmu.num_counters - cpuc->n_pair; + gpmax = num_counters - cpuc->n_pair; WARN_ON(gpmax <= 0); } @@ -1096,8 +1130,9 @@ static void del_nr_metric_event(struct cpu_hw_events *cpuc, static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, int max_count, int n) { + union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); - if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) + if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) return -EINVAL; if (n >= max_count + cpuc->n_metric) @@ -1118,10 +1153,12 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { + int num_counters = hybrid(cpuc->pmu, num_counters); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct perf_event *event; int n, max_count; - max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; + max_count = num_counters + num_counters_fixed; /* current number of events already accepted */ n = cpuc->n_events; @@ -1480,7 +1517,6 @@ static void x86_pmu_start(struct perf_event *event, int flags) cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); - __set_bit(idx, cpuc->running); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } @@ -1489,18 +1525,19 @@ void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; u64 pebs, debugctl; - struct cpu_hw_events *cpuc; + int cpu = smp_processor_id(); + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int num_counters = hybrid(cpuc->pmu, num_counters); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); + struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); unsigned long flags; - int cpu, idx; + int idx; - if (!x86_pmu.num_counters) + if (!num_counters) return; local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_events, cpu); - if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -1512,7 +1549,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - if (x86_pmu.pebs_constraints) { + if (pebs_constraints) { rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } @@ -1523,7 +1560,7 @@ void perf_event_print_debug(void) } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < num_counters; idx++) { rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrl(x86_pmu_event_addr(idx), pmc_count); @@ -1536,8 +1573,8 @@ void perf_event_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - if (fixed_counter_disabled(idx)) + for (idx = 0; idx < num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); @@ -1573,6 +1610,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* @@ -1612,7 +1650,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; - if (x86_pmu.intel_cap.perf_metrics) + if (intel_cap.perf_metrics) del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); @@ -1822,6 +1860,49 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, pmu_attr->event_str_noht); } +ssize_t events_hybrid_sysfs_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_hybrid_attr, attr); + struct x86_hybrid_pmu *pmu; + const char *str, *next_str; + int i; + + if (hweight64(pmu_attr->pmu_type) == 1) + return sprintf(page, "%s", pmu_attr->event_str); + + /* + * Hybrid PMUs may support the same event name, but with different + * event encoding, e.g., the mem-loads event on an Atom PMU has + * different event encoding from a Core PMU. + * + * The event_str includes all event encodings. Each event encoding + * is divided by ";". The order of the event encodings must follow + * the order of the hybrid PMU index. + */ + pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + + str = pmu_attr->event_str; + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type)) + continue; + if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) { + next_str = strchr(str, ';'); + if (next_str) + return snprintf(page, next_str - str + 1, "%s", str); + else + return sprintf(page, "%s", str); + } + str = strchr(str, ';'); + str++; + } + + return 0; +} +EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); + EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); @@ -1948,6 +2029,37 @@ static void _x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } +void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, + u64 intel_ctrl) +{ + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & intel_ctrl)); + pr_info("... event mask: %016Lx\n", intel_ctrl); +} + +/* + * The generic code is not hybrid friendly. The hybrid_pmu->pmu + * of the first registered PMU is unconditionally assigned to + * each possible cpuctx->ctx.pmu. + * Update the correct hybrid PMU to the cpuctx->ctx.pmu. + */ +void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu) +{ + struct perf_cpu_context *cpuctx; + + if (!pmu->pmu_cpu_context) + return; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx->ctx.pmu = pmu; +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1981,7 +2093,7 @@ static int __init init_hw_perf_events(void) pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists()) + if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) return 0; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -2008,15 +2120,11 @@ static int __init init_hw_perf_events(void) pmu.attr_update = x86_pmu.attr_update; - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %lu\n", - hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1) - << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl)); - pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + if (!is_hybrid()) { + x86_pmu_show_pmu_cap(x86_pmu.num_counters, + x86_pmu.num_counters_fixed, + x86_pmu.intel_ctrl); + } if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; @@ -2046,9 +2154,46 @@ static int __init init_hw_perf_events(void) if (err) goto out1; - err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); - if (err) - goto out2; + if (!is_hybrid()) { + err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + if (err) + goto out2; + } else { + u8 cpu_type = get_this_hybrid_cpu_type(); + struct x86_hybrid_pmu *hybrid_pmu; + int i, j; + + if (!cpu_type && x86_pmu.get_hybrid_cpu_type) + cpu_type = x86_pmu.get_hybrid_cpu_type(); + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + hybrid_pmu = &x86_pmu.hybrid_pmu[i]; + + hybrid_pmu->pmu = pmu; + hybrid_pmu->pmu.type = -1; + hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; + hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS; + hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; + + err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, + (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1); + if (err) + break; + + if (cpu_type == hybrid_pmu->cpu_type) + x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id()); + } + + if (i < x86_pmu.num_hybrid_pmus) { + for (j = 0; j < i; j++) + perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); + pr_warn("Failed to register hybrid PMUs\n"); + kfree(x86_pmu.hybrid_pmu); + x86_pmu.hybrid_pmu = NULL; + x86_pmu.num_hybrid_pmus = 0; + goto out2; + } + } return 0; @@ -2173,16 +2318,27 @@ static void free_fake_cpuc(struct cpu_hw_events *cpuc) kfree(cpuc); } -static struct cpu_hw_events *allocate_fake_cpuc(void) +static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu) { struct cpu_hw_events *cpuc; - int cpu = raw_smp_processor_id(); + int cpu; cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); if (!cpuc) return ERR_PTR(-ENOMEM); cpuc->is_fake = 1; + if (is_hybrid()) { + struct x86_hybrid_pmu *h_pmu; + + h_pmu = hybrid_pmu(event_pmu); + if (cpumask_empty(&h_pmu->supported_cpus)) + goto error; + cpu = cpumask_first(&h_pmu->supported_cpus); + } else + cpu = raw_smp_processor_id(); + cpuc->pmu = event_pmu; + if (intel_cpuc_prepare(cpuc, cpu)) goto error; @@ -2201,7 +2357,7 @@ static int validate_event(struct perf_event *event) struct event_constraint *c; int ret = 0; - fake_cpuc = allocate_fake_cpuc(); + fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); @@ -2235,7 +2391,27 @@ static int validate_group(struct perf_event *event) struct cpu_hw_events *fake_cpuc; int ret = -EINVAL, n; - fake_cpuc = allocate_fake_cpuc(); + /* + * Reject events from different hybrid PMUs. + */ + if (is_hybrid()) { + struct perf_event *sibling; + struct pmu *pmu = NULL; + + if (is_x86_event(leader)) + pmu = leader->pmu; + + for_each_sibling_event(sibling, leader) { + if (!is_x86_event(sibling)) + continue; + if (!pmu) + pmu = sibling->pmu; + else if (pmu != sibling->pmu) + return ret; + } + } + + fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); /* @@ -2263,35 +2439,26 @@ out: static int x86_pmu_event_init(struct perf_event *event) { - struct pmu *tmp; + struct x86_hybrid_pmu *pmu = NULL; int err; - switch (event->attr.type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - break; - - default: + if ((event->attr.type != event->pmu->type) && + (event->attr.type != PERF_TYPE_HARDWARE) && + (event->attr.type != PERF_TYPE_HW_CACHE)) return -ENOENT; + + if (is_hybrid() && (event->cpu != -1)) { + pmu = hybrid_pmu(event->pmu); + if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus)) + return -ENOENT; } err = __x86_pmu_event_init(event); if (!err) { - /* - * we temporarily connect event to its pmu - * such that validate_group() can classify - * it as an x86 event using is_x86_event() - */ - tmp = event->pmu; - event->pmu = &pmu; - if (event->group_leader != event) err = validate_group(event); else err = validate_event(event); - - event->pmu = tmp; } if (err) { if (event->destroy) @@ -2475,6 +2642,14 @@ static int x86_pmu_aux_output_match(struct perf_event *event) return 0; } +static int x86_pmu_filter_match(struct perf_event *event) +{ + if (x86_pmu.filter_match) + return x86_pmu.filter_match(event); + + return 1; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -2502,6 +2677,8 @@ static struct pmu pmu = { .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, + + .filter_match = x86_pmu_filter_match, }; void arch_perf_update_userpage(struct perf_event *event, @@ -2770,6 +2947,11 @@ unsigned long perf_misc_flags(struct pt_regs *regs) void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { cap->version = x86_pmu.version; + /* + * KVM doesn't support the hybrid PMU yet. + * Return the common value in global x86_pmu, + * which available for all cores. + */ cap->num_counters_gp = x86_pmu.num_counters; cap->num_counters_fixed = x86_pmu.num_counters_fixed; cap->bit_width_gp = x86_pmu.cntval_bits; diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index e67a5886336c..10bde6c5abb2 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o -intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o uncore_discovery.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 731dd8d0dbb1..6320d2cfd9d3 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -594,7 +594,7 @@ static __init int bts_init(void) * we cannot use the user mapping since it will not be available * if we're not running the owning process. * - * With PTI we can't use the kernal map either, because its not + * With PTI we can't use the kernel map either, because its not * there when we run userspace. * * For now, disable this driver when using PTI. diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 37ce38403cb8..2521d03de5e0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -137,7 +137,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ + INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ @@ -2076,6 +2076,14 @@ static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_grt_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0), + EVENT_EXTRA_END +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -2153,10 +2161,11 @@ static void intel_pmu_disable_all(void) static void __intel_pmu_enable_all(int added, bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); intel_pmu_lbr_enable_all(pmi); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, - x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); + intel_ctrl & ~cpuc->intel_ctrl_guest_mask); if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = @@ -2186,7 +2195,7 @@ static void intel_pmu_enable_all(int added) * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either * in sequence on the same PMC or on different PMCs. * - * In practise it appears some of these events do in fact count, and + * In practice it appears some of these events do in fact count, and * we need to program all 4 events. */ static void intel_pmu_nhm_workaround(void) @@ -2429,13 +2438,23 @@ static int icl_set_topdown_event_period(struct perf_event *event) return 0; } +static int adl_set_topdown_event_period(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type != hybrid_big) + return 0; + + return icl_set_topdown_event_period(event); +} + static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) { u32 val; /* * The metric is reported as an 8bit integer fraction - * suming up to 0xff. + * summing up to 0xff. * slots-in-metric = (Metric / 0xff) * slots */ val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff; @@ -2569,6 +2588,17 @@ static u64 icl_update_topdown_event(struct perf_event *event) x86_pmu.num_topdown_events - 1); } +static u64 adl_update_topdown_event(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type != hybrid_big) + return 0; + + return icl_update_topdown_event(event); +} + + static void intel_pmu_read_topdown_event(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2709,22 +2739,25 @@ int intel_pmu_save_and_restart(struct perf_event *event) static void intel_pmu_reset(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); + int num_counters = hybrid(cpuc->pmu, num_counters); unsigned long flags; int idx; - if (!x86_pmu.num_counters) + if (!num_counters) return; local_irq_save(flags); pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < num_counters; idx++) { wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - if (fixed_counter_disabled(idx)) + for (idx = 0; idx < num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx, cpuc->pmu)) continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } @@ -2753,6 +2786,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int bit; int handled = 0; + u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); inc_irq_stat(apic_perf_irqs); @@ -2776,7 +2810,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * processing loop coming after that the function, otherwise * phony regular samples may be generated in the sampling buffer * not marked with the EXACT tag. Another possibility is to have - * one PEBS event and at least one non-PEBS event whic hoverflows + * one PEBS event and at least one non-PEBS event which overflows * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will * not be set, yet the overflow status bit for the PEBS counter will * be on Skylake. @@ -2798,7 +2832,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) handled++; x86_pmu.drain_pebs(regs, &data); - status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; + status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; /* * PMI throttle may be triggered, which stops the PEBS event. @@ -2824,7 +2858,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) } /* - * Intel Perf mertrics + * Intel Perf metrics */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; @@ -2961,8 +2995,10 @@ intel_vlbr_constraints(struct perf_event *event) return NULL; } -static int intel_alt_er(int idx, u64 config) +static int intel_alt_er(struct cpu_hw_events *cpuc, + int idx, u64 config) { + struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs); int alt_idx = idx; if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) @@ -2974,7 +3010,7 @@ static int intel_alt_er(int idx, u64 config) if (idx == EXTRA_REG_RSP_1) alt_idx = EXTRA_REG_RSP_0; - if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + if (config & ~extra_regs[alt_idx].valid_mask) return idx; return alt_idx; @@ -2982,15 +3018,16 @@ static int intel_alt_er(int idx, u64 config) static void intel_fixup_er(struct perf_event *event, int idx) { + struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); event->hw.extra_reg.idx = idx; if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; + event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; } else if (idx == EXTRA_REG_RSP_1) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; + event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } } @@ -3066,7 +3103,7 @@ again: */ c = NULL; } else { - idx = intel_alt_er(idx, reg->config); + idx = intel_alt_er(cpuc, idx, reg->config); if (idx != reg->idx) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -3131,10 +3168,11 @@ struct event_constraint * x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { + struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints); struct event_constraint *c; - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { + if (event_constraints) { + for_each_event_constraint(c, event_constraints) { if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; @@ -3142,7 +3180,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, } } - return &unconstrained; + return &hybrid_var(cpuc->pmu, unconstrained); } static struct event_constraint * @@ -3646,6 +3684,23 @@ static inline bool is_mem_loads_aux_event(struct perf_event *event) return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82); } +static inline bool require_mem_loads_aux_event(struct perf_event *event) +{ + if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX)) + return false; + + if (is_hybrid()) + return hybrid_pmu(event->pmu)->cpu_type == hybrid_big; + + return true; +} + +static inline bool intel_pmu_has_cap(struct perf_event *event, int idx) +{ + union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap); + + return test_bit(idx, (unsigned long *)&intel_cap->capabilities); +} static int intel_pmu_hw_config(struct perf_event *event) { @@ -3702,7 +3757,8 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; } - if (event->attr.type != PERF_TYPE_RAW) + if ((event->attr.type == PERF_TYPE_HARDWARE) || + (event->attr.type == PERF_TYPE_HW_CACHE)) return 0; /* @@ -3715,7 +3771,7 @@ static int intel_pmu_hw_config(struct perf_event *event) * with a slots event as group leader. When the slots event * is used in a metrics group, it too cannot support sampling. */ - if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) { + if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { if (event->attr.config1 || event->attr.config2) return -EINVAL; @@ -3766,7 +3822,7 @@ static int intel_pmu_hw_config(struct perf_event *event) * event. The rule is to simplify the implementation of the check. * That's because perf cannot have a complete group at the moment. */ - if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX && + if (require_mem_loads_aux_event(event) && (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) && is_mem_loads_event(event)) { struct perf_event *leader = event->group_leader; @@ -3801,10 +3857,11 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask; if (x86_pmu.flags & PMU_FL_PEBS_ALL) arr[0].guest &= ~cpuc->pebs_enabled; else @@ -4042,6 +4099,39 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static struct event_constraint * +adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type == hybrid_big) + return spr_get_event_constraints(cpuc, idx, event); + else if (pmu->cpu_type == hybrid_small) + return tnt_get_event_constraints(cpuc, idx, event); + + WARN_ON(1); + return &emptyconstraint; +} + +static int adl_hw_config(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type == hybrid_big) + return hsw_hw_config(event); + else if (pmu->cpu_type == hybrid_small) + return intel_pmu_hw_config(event); + + WARN_ON(1); + return -EOPNOTSUPP; +} + +static u8 adl_get_hybrid_cpu_type(void) +{ + return hybrid_big; +} + /* * Broadwell: * @@ -4145,7 +4235,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { cpuc->pebs_record_size = x86_pmu.pebs_record_size; - if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { + if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) goto err; @@ -4199,12 +4289,62 @@ static void flip_smm_bit(void *data) } } +static bool init_hybrid_pmu(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + u8 cpu_type = get_this_hybrid_cpu_type(); + struct x86_hybrid_pmu *pmu = NULL; + int i; + + if (!cpu_type && x86_pmu.get_hybrid_cpu_type) + cpu_type = x86_pmu.get_hybrid_cpu_type(); + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + if (x86_pmu.hybrid_pmu[i].cpu_type == cpu_type) { + pmu = &x86_pmu.hybrid_pmu[i]; + break; + } + } + if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) { + cpuc->pmu = NULL; + return false; + } + + /* Only check and dump the PMU information for the first CPU */ + if (!cpumask_empty(&pmu->supported_cpus)) + goto end; + + if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed)) + return false; + + pr_info("%s PMU driver: ", pmu->name); + + if (pmu->intel_cap.pebs_output_pt_available) + pr_cont("PEBS-via-PT "); + + pr_cont("\n"); + + x86_pmu_show_pmu_cap(pmu->num_counters, pmu->num_counters_fixed, + pmu->intel_ctrl); + +end: + cpumask_set_cpu(cpu, &pmu->supported_cpus); + cpuc->pmu = &pmu->pmu; + + x86_pmu_update_cpu_context(&pmu->pmu, cpu); + + return true; +} + static void intel_pmu_cpu_starting(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int core_id = topology_core_id(cpu); int i; + if (is_hybrid() && !init_hybrid_pmu(cpu)) + return; + init_debug_store_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up. @@ -4222,8 +4362,16 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); - /* Disable perf metrics if any added CPU doesn't support it. */ - if (x86_pmu.intel_cap.perf_metrics) { + /* + * Disable perf metrics if any added CPU doesn't support it. + * + * Turn off the check for a hybrid architecture, because the + * architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicate + * the architecture features. The perf metrics is a model-specific + * feature for now. The corresponding bit should always be 0 on + * a hybrid platform, e.g., Alder Lake. + */ + if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) { union perf_capabilities perf_cap; rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); @@ -4310,7 +4458,12 @@ void intel_cpuc_finish(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dead(int cpu) { - intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu)); + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + intel_cpuc_finish(cpuc); + + if (is_hybrid() && cpuc->pmu) + cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus); } static void intel_pmu_sched_task(struct perf_event_context *ctx, @@ -4339,6 +4492,14 @@ static int intel_pmu_aux_output_match(struct perf_event *event) return is_intel_pt_event(event); } +static int intel_pmu_filter_match(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + unsigned int cpu = smp_processor_id(); + + return cpumask_test_cpu(cpu, &pmu->supported_cpus); +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -4516,7 +4677,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000), @@ -4594,7 +4755,7 @@ static bool check_msr(unsigned long msr, u64 mask) /* * Disable the check for real HW, so we don't - * mess with potentionaly enabled registers: + * mess with potentially enabled registers: */ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) return true; @@ -4659,7 +4820,7 @@ static __init void intel_arch_events_quirk(void) { int bit; - /* disable event that reported as not presend by cpuid */ + /* disable event that reported as not present by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; pr_warn("CPUID marked event: \'%s\' unavailable\n", @@ -4879,7 +5040,7 @@ static void update_tfa_sched(void *ignored) * and if so force schedule out for all event types all contexts */ if (test_bit(3, cpuc->active_mask)) - perf_pmu_resched(x86_get_pmu()); + perf_pmu_resched(x86_get_pmu(smp_processor_id())); } static ssize_t show_sysctl_tfa(struct device *cdev, @@ -5041,8 +5202,299 @@ static const struct attribute_group *attr_update[] = { NULL, }; +EVENT_ATTR_STR_HYBRID(slots, slots_adl, "event=0x00,umask=0x4", hybrid_big); +EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_adl, "event=0xc2,umask=0x0;event=0x00,umask=0x80", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-bad-spec, td_bad_spec_adl, "event=0x73,umask=0x0;event=0x00,umask=0x81", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_adl, "event=0x71,umask=0x0;event=0x00,umask=0x82", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_adl, "event=0x74,umask=0x0;event=0x00,umask=0x83", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-heavy-ops, td_heavy_ops_adl, "event=0x00,umask=0x84", hybrid_big); +EVENT_ATTR_STR_HYBRID(topdown-br-mispredict, td_br_mis_adl, "event=0x00,umask=0x85", hybrid_big); +EVENT_ATTR_STR_HYBRID(topdown-fetch-lat, td_fetch_lat_adl, "event=0x00,umask=0x86", hybrid_big); +EVENT_ATTR_STR_HYBRID(topdown-mem-bound, td_mem_bound_adl, "event=0x00,umask=0x87", hybrid_big); + +static struct attribute *adl_hybrid_events_attrs[] = { + EVENT_PTR(slots_adl), + EVENT_PTR(td_retiring_adl), + EVENT_PTR(td_bad_spec_adl), + EVENT_PTR(td_fe_bound_adl), + EVENT_PTR(td_be_bound_adl), + EVENT_PTR(td_heavy_ops_adl), + EVENT_PTR(td_br_mis_adl), + EVENT_PTR(td_fetch_lat_adl), + EVENT_PTR(td_mem_bound_adl), + NULL, +}; + +/* Must be in IDX order */ +EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(mem-loads-aux, mem_ld_aux_adl, "event=0x03,umask=0x82", hybrid_big); + +static struct attribute *adl_hybrid_mem_attrs[] = { + EVENT_PTR(mem_ld_adl), + EVENT_PTR(mem_st_adl), + EVENT_PTR(mem_ld_aux_adl), + NULL, +}; + +EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-conflict, tx_conflict_adl, "event=0x54,umask=0x1", hybrid_big); +EVENT_ATTR_STR_HYBRID(cycles-t, cycles_t_adl, "event=0x3c,in_tx=1", hybrid_big); +EVENT_ATTR_STR_HYBRID(cycles-ct, cycles_ct_adl, "event=0x3c,in_tx=1,in_tx_cp=1", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-capacity-read, tx_capacity_read_adl, "event=0x54,umask=0x80", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-capacity-write, tx_capacity_write_adl, "event=0x54,umask=0x2", hybrid_big); + +static struct attribute *adl_hybrid_tsx_attrs[] = { + EVENT_PTR(tx_start_adl), + EVENT_PTR(tx_abort_adl), + EVENT_PTR(tx_commit_adl), + EVENT_PTR(tx_capacity_read_adl), + EVENT_PTR(tx_capacity_write_adl), + EVENT_PTR(tx_conflict_adl), + EVENT_PTR(cycles_t_adl), + EVENT_PTR(cycles_ct_adl), + NULL, +}; + +FORMAT_ATTR_HYBRID(in_tx, hybrid_big); +FORMAT_ATTR_HYBRID(in_tx_cp, hybrid_big); +FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small); +FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small); +FORMAT_ATTR_HYBRID(frontend, hybrid_big); + +static struct attribute *adl_hybrid_extra_attr_rtm[] = { + FORMAT_HYBRID_PTR(in_tx), + FORMAT_HYBRID_PTR(in_tx_cp), + FORMAT_HYBRID_PTR(offcore_rsp), + FORMAT_HYBRID_PTR(ldlat), + FORMAT_HYBRID_PTR(frontend), + NULL, +}; + +static struct attribute *adl_hybrid_extra_attr[] = { + FORMAT_HYBRID_PTR(offcore_rsp), + FORMAT_HYBRID_PTR(ldlat), + FORMAT_HYBRID_PTR(frontend), + NULL, +}; + +static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + struct perf_pmu_events_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr); + + return pmu->cpu_type & pmu_attr->pmu_type; +} + +static umode_t hybrid_events_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0; +} + +static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu) +{ + int cpu = cpumask_first(&pmu->supported_cpus); + + return (cpu >= nr_cpu_ids) ? -1 : cpu; +} + +static umode_t hybrid_tsx_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + int cpu = hybrid_find_supported_cpu(pmu); + + return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) && cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? attr->mode : 0; +} + +static umode_t hybrid_format_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + struct perf_pmu_format_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr); + int cpu = hybrid_find_supported_cpu(pmu); + + return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0; +} + +static struct attribute_group hybrid_group_events_td = { + .name = "events", + .is_visible = hybrid_events_is_visible, +}; + +static struct attribute_group hybrid_group_events_mem = { + .name = "events", + .is_visible = hybrid_events_is_visible, +}; + +static struct attribute_group hybrid_group_events_tsx = { + .name = "events", + .is_visible = hybrid_tsx_is_visible, +}; + +static struct attribute_group hybrid_group_format_extra = { + .name = "format", + .is_visible = hybrid_format_is_visible, +}; + +static ssize_t intel_hybrid_get_attr_cpus(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + + return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus); +} + +static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL); +static struct attribute *intel_hybrid_cpus_attrs[] = { + &dev_attr_cpus.attr, + NULL, +}; + +static struct attribute_group hybrid_group_cpus = { + .attrs = intel_hybrid_cpus_attrs, +}; + +static const struct attribute_group *hybrid_attr_update[] = { + &hybrid_group_events_td, + &hybrid_group_events_mem, + &hybrid_group_events_tsx, + &group_caps_gen, + &group_caps_lbr, + &hybrid_group_format_extra, + &group_default, + &hybrid_group_cpus, + NULL, +}; + static struct attribute *empty_attrs; +static void intel_pmu_check_num_counters(int *num_counters, + int *num_counters_fixed, + u64 *intel_ctrl, u64 fixed_mask) +{ + if (*num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + *num_counters, INTEL_PMC_MAX_GENERIC); + *num_counters = INTEL_PMC_MAX_GENERIC; + } + *intel_ctrl = (1ULL << *num_counters) - 1; + + if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + *num_counters_fixed, INTEL_PMC_MAX_FIXED); + *num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED; +} + +static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints, + int num_counters, + int num_counters_fixed, + u64 intel_ctrl) +{ + struct event_constraint *c; + + if (!event_constraints) + return; + + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, event_constraints) { + /* + * Don't extend the topdown slots and metrics + * events to the generic counters. + */ + if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + /* + * Disable topdown slots and metrics events, + * if slots event is not in CPUID. + */ + if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl)) + c->idxmsk64 = 0; + c->weight = hweight64(c->idxmsk64); + continue; + } + + if (c->cmask == FIXED_EVENT_FLAGS) { + /* Disabled fixed counters which are not in CPUID */ + c->idxmsk64 &= intel_ctrl; + + if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) + c->idxmsk64 |= (1ULL << num_counters) - 1; + } + c->idxmsk64 &= + ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed)); + c->weight = hweight64(c->idxmsk64); + } +} + +static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs) +{ + struct extra_reg *er; + + /* + * Access extra MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support offcore event + * Check all extra_regs here. + */ + if (!extra_regs) + return; + + for (er = extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x11UL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } +} + +static void intel_pmu_check_hybrid_pmus(u64 fixed_mask) +{ + struct x86_hybrid_pmu *pmu; + int i; + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + pmu = &x86_pmu.hybrid_pmu[i]; + + intel_pmu_check_num_counters(&pmu->num_counters, + &pmu->num_counters_fixed, + &pmu->intel_ctrl, + fixed_mask); + + if (pmu->intel_cap.perf_metrics) { + pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + pmu->intel_ctrl |= INTEL_PMC_MSK_FIXED_SLOTS; + } + + if (pmu->intel_cap.pebs_output_pt_available) + pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + + intel_pmu_check_event_constraints(pmu->event_constraints, + pmu->num_counters, + pmu->num_counters_fixed, + pmu->intel_ctrl); + + intel_pmu_check_extra_regs(pmu->extra_regs); + } +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -5053,12 +5505,11 @@ __init int intel_pmu_init(void) union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; - struct event_constraint *c; unsigned int fixed_mask; - struct extra_reg *er; bool pmem = false; int version, i; char *name; + struct x86_hybrid_pmu *pmu; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -5653,6 +6104,99 @@ __init int intel_pmu_init(void) name = "sapphire_rapids"; break; + case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: + /* + * Alder Lake has 2 types of CPU, core and atom. + * + * Initialize the common PerfMon capabilities here. + */ + x86_pmu.hybrid_pmu = kcalloc(X86_HYBRID_NUM_PMUS, + sizeof(struct x86_hybrid_pmu), + GFP_KERNEL); + if (!x86_pmu.hybrid_pmu) + return -ENOMEM; + static_branch_enable(&perf_is_hybrid); + x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS; + + x86_pmu.late_ack = true; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(false); + x86_pmu.num_topdown_events = 8; + x86_pmu.update_topdown_event = adl_update_topdown_event; + x86_pmu.set_topdown_event_period = adl_set_topdown_event_period; + + x86_pmu.filter_match = intel_pmu_filter_match; + x86_pmu.get_event_constraints = adl_get_event_constraints; + x86_pmu.hw_config = adl_hw_config; + x86_pmu.limit_period = spr_limit_period; + x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type; + /* + * The rtm_abort_event is used to check whether to enable GPRs + * for the RTM abort event. Atom doesn't have the RTM abort + * event. There is no harmful to set it in the common + * x86_pmu.rtm_abort_event. + */ + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); + + td_attr = adl_hybrid_events_attrs; + mem_attr = adl_hybrid_mem_attrs; + tsx_attr = adl_hybrid_tsx_attrs; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + adl_hybrid_extra_attr_rtm : adl_hybrid_extra_attr; + + /* Initialize big core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; + pmu->name = "cpu_core"; + pmu->cpu_type = hybrid_big; + pmu->num_counters = x86_pmu.num_counters + 2; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); + pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; + pmu->intel_cap.perf_metrics = 1; + pmu->intel_cap.pebs_output_pt_available = 0; + + memcpy(pmu->hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids)); + memcpy(pmu->hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs)); + pmu->event_constraints = intel_spr_event_constraints; + pmu->pebs_constraints = intel_spr_pebs_event_constraints; + pmu->extra_regs = intel_spr_extra_regs; + + /* Initialize Atom core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; + pmu->name = "cpu_atom"; + pmu->cpu_type = hybrid_small; + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + pmu->max_pebs_events = x86_pmu.max_pebs_events; + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); + pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; + pmu->intel_cap.perf_metrics = 0; + pmu->intel_cap.pebs_output_pt_available = 1; + + memcpy(pmu->hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids)); + memcpy(pmu->hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs)); + pmu->hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + pmu->event_constraints = intel_slm_event_constraints; + pmu->pebs_constraints = intel_grt_pebs_event_constraints; + pmu->extra_regs = intel_grt_extra_regs; + pr_cont("Alderlake Hybrid events, "); + name = "alderlake_hybrid"; + break; + default: switch (x86_pmu.version) { case 1: @@ -5673,68 +6217,36 @@ __init int intel_pmu_init(void) snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name); + if (!is_hybrid()) { + group_events_td.attrs = td_attr; + group_events_mem.attrs = mem_attr; + group_events_tsx.attrs = tsx_attr; + group_format_extra.attrs = extra_attr; + group_format_extra_skl.attrs = extra_skl_attr; - group_events_td.attrs = td_attr; - group_events_mem.attrs = mem_attr; - group_events_tsx.attrs = tsx_attr; - group_format_extra.attrs = extra_attr; - group_format_extra_skl.attrs = extra_skl_attr; - - x86_pmu.attr_update = attr_update; - - if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); - x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; + x86_pmu.attr_update = attr_update; + } else { + hybrid_group_events_td.attrs = td_attr; + hybrid_group_events_mem.attrs = mem_attr; + hybrid_group_events_tsx.attrs = tsx_attr; + hybrid_group_format_extra.attrs = extra_attr; - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + x86_pmu.attr_update = hybrid_attr_update; } - x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED; + intel_pmu_check_num_counters(&x86_pmu.num_counters, + &x86_pmu.num_counters_fixed, + &x86_pmu.intel_ctrl, + (u64)fixed_mask); /* AnyThread may be deprecated on arch perfmon v5 or later */ if (x86_pmu.intel_cap.anythread_deprecated) x86_pmu.format_attrs = intel_arch_formats_attr; - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - /* - * Don't extend the topdown slots and metrics - * events to the generic counters. - */ - if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { - /* - * Disable topdown slots and metrics events, - * if slots event is not in CPUID. - */ - if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl)) - c->idxmsk64 = 0; - c->weight = hweight64(c->idxmsk64); - continue; - } - - if (c->cmask == FIXED_EVENT_FLAGS) { - /* Disabled fixed counters which are not in CPUID */ - c->idxmsk64 &= x86_pmu.intel_ctrl; - - if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - } - c->idxmsk64 &= - ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); - c->weight = hweight64(c->idxmsk64); - } - } - + intel_pmu_check_event_constraints(x86_pmu.event_constraints, + x86_pmu.num_counters, + x86_pmu.num_counters_fixed, + x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. * E.g. KVM doesn't support LBR MSR @@ -5752,19 +6264,7 @@ __init int intel_pmu_init(void) if (x86_pmu.lbr_nr) pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); - /* - * Access extra MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support offcore event - * Check all extra_regs here. - */ - if (x86_pmu.extra_regs) { - for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x11UL); - /* Disable LBR select mapping */ - if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) - x86_pmu.lbr_sel_map = NULL; - } - } + intel_pmu_check_extra_regs(x86_pmu.extra_regs); /* Support full width counters using alternative MSR range */ if (x86_pmu.intel_cap.full_width_write) { @@ -5773,9 +6273,12 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } - if (x86_pmu.intel_cap.perf_metrics) + if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + if (is_hybrid()) + intel_pmu_check_hybrid_pmus((u64)fixed_mask); + return 0; } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 407eee5f6f95..433399069e27 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -40,7 +40,7 @@ * Model specific counters: * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 - * Available model: SLM,AMT,GLM,CNL,TNT + * Available model: SLM,AMT,GLM,CNL,TNT,ADL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -51,46 +51,49 @@ * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT,RKL + * TNT,RKL,ADL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL,RKL + * ICL,TGL,RKL,ADL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL,TNT,RKL + * KBL,CML,ICL,TGL,TNT,RKL,ADL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL + * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL, + * ADL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT,RKL + * TNT,RKL,ADL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, - * KBL,CML,ICL,TGL,RKL + * KBL,CML,ICL,TGL,RKL,ADL * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL, + * ADL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL, + * ADL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT,RKL + * TNT,RKL,ADL * Scope: Package (physical package) * */ @@ -563,6 +566,20 @@ static const struct cstate_model icl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model adl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -650,6 +667,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index d32b302719fe..1ec8fd311f38 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -779,6 +779,13 @@ struct event_constraint intel_glm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_grt_pebs_event_constraints[] = { + /* Allow all events as PEBS with no flags */ + INTEL_PLD_CONSTRAINT(0x5d0, 0xf), + INTEL_PSD_CONSTRAINT(0x6d0, 0xf), + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ @@ -959,13 +966,14 @@ struct event_constraint intel_spr_pebs_event_constraints[] = { struct event_constraint *intel_pebs_constraints(struct perf_event *event) { + struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints); struct event_constraint *c; if (!event->attr.precise_ip) return NULL; - if (x86_pmu.pebs_constraints) { - for_each_event_constraint(c, x86_pmu.pebs_constraints) { + if (pebs_constraints) { + for_each_event_constraint(c, pebs_constraints) { if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; @@ -1007,6 +1015,8 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; + int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); u64 threshold; int reserved; @@ -1014,9 +1024,9 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) return; if (x86_pmu.flags & PMU_FL_PEBS_ALL) - reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed; + reserved = max_pebs_events + num_counters_fixed; else - reserved = x86_pmu.max_pebs_events; + reserved = max_pebs_events; if (cpuc->n_pebs == cpuc->n_large_pebs) { threshold = ds->pebs_absolute_maximum - @@ -1353,14 +1363,13 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) is_64bit = kernel_ip(to) || any_64bit_mode(regs); #endif insn_init(&insn, kaddr, size, is_64bit); - insn_get_length(&insn); + /* - * Make sure there was not a problem decoding the - * instruction and getting the length. This is - * doubly important because we have an infinite - * loop if insn.length=0. + * Make sure there was not a problem decoding the instruction. + * This is doubly important because we have an infinite loop if + * insn.length=0. */ - if (!insn.length) + if (insn_get_length(&insn)) break; to += insn.length; @@ -1805,7 +1814,7 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) * * [-period, 0] * - * the difference between two consequtive reads is: + * the difference between two consecutive reads is: * * A) value2 - value1; * when no overflows have happened in between, @@ -2072,6 +2081,8 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct debug_store *ds = cpuc->ds; struct perf_event *event; void *base, *at, *top; @@ -2086,9 +2097,9 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d ds->pebs_index = ds->pebs_buffer_base; - mask = ((1ULL << x86_pmu.max_pebs_events) - 1) | - (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); - size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + mask = ((1ULL << max_pebs_events) - 1) | + (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); + size = INTEL_PMC_IDX_FIXED + num_counters_fixed; if (unlikely(base >= top)) { intel_pmu_pebs_event_update_no_drain(cpuc, size); @@ -2192,7 +2203,7 @@ void __init intel_ds_init(void) PERF_SAMPLE_TIME; x86_pmu.flags |= PMU_FL_PEBS_ALL; pebs_qual = "-baseline"; - x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; } else { /* Only basic record supported */ x86_pmu.large_pebs_flags &= @@ -2205,9 +2216,9 @@ void __init intel_ds_init(void) } pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); - if (x86_pmu.intel_cap.pebs_output_pt_available) { + if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); - x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; } break; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 21890dacfcfe..76dbab6ac9fb 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -705,7 +705,7 @@ void intel_pmu_lbr_add(struct perf_event *event) void release_lbr_buffers(void) { - struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache; + struct kmem_cache *kmem_cache; struct cpu_hw_events *cpuc; int cpu; @@ -714,6 +714,7 @@ void release_lbr_buffers(void) for_each_possible_cpu(cpu) { cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + kmem_cache = x86_get_pmu(cpu)->task_ctx_cache; if (kmem_cache && cpuc->lbr_xsave) { kmem_cache_free(kmem_cache, cpuc->lbr_xsave); cpuc->lbr_xsave = NULL; @@ -1198,7 +1199,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) /* * The LBR logs any address in the IP, even if the IP just * faulted. This means userspace can control the from address. - * Ensure we don't blindy read any address by validating it is + * Ensure we don't blindly read any address by validating it is * a known text address. */ if (kernel_text_address(from)) { @@ -1224,8 +1225,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); #endif insn_init(&insn, addr, bytes_read, is64); - insn_get_opcode(&insn); - if (!insn.opcode.got) + if (insn_get_opcode(&insn)) return X86_BR_ABORT; switch (insn.opcode.bytes[0]) { @@ -1262,8 +1262,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) ret = X86_BR_INT; break; case 0xe8: /* call near rel */ - insn_get_immediate(&insn); - if (insn.immediate1.value == 0) { + if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { /* zero length call */ ret = X86_BR_ZERO_CALL; break; @@ -1279,7 +1278,9 @@ static int branch_type(unsigned long from, unsigned long to, int abort) ret = X86_BR_JMP; break; case 0xff: /* call near absolute, call far absolute ind */ - insn_get_modrm(&insn); + if (insn_get_modrm(&insn)) + return X86_BR_ABORT; + ext = (insn.modrm.bytes[0] >> 3) & 0x7; switch (ext) { case 2: /* near ind call */ @@ -1609,7 +1610,7 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0); if (lbr_from_signext_quirk_needed()) static_branch_enable(&lbr_from_quirk_key); @@ -1629,7 +1630,7 @@ __init void intel_pmu_lbr_init_skl(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0); /* * SW branch filter usage: @@ -1726,7 +1727,7 @@ static bool is_arch_lbr_xsave_available(void) void __init intel_pmu_arch_lbr_init(void) { - struct pmu *pmu = x86_get_pmu(); + struct pmu *pmu = x86_get_pmu(smp_processor_id()); union cpuid28_eax eax; union cpuid28_ebx ebx; union cpuid28_ecx ecx; diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index a4cc66005ce8..7951a5dc73b6 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -24,7 +24,7 @@ struct p4_event_bind { unsigned int escr_msr[2]; /* ESCR MSR for this event */ unsigned int escr_emask; /* valid ESCR EventMask bits */ unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ + char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ }; struct p4_pebs_bind { @@ -45,7 +45,7 @@ struct p4_pebs_bind { * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of * event configuration to find out which values are to be * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT - * resgisters + * registers */ static struct p4_pebs_bind p4_pebs_bind_map[] = { P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), @@ -947,7 +947,7 @@ static void p4_pmu_enable_pebs(u64 config) (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } -static void p4_pmu_enable_event(struct perf_event *event) +static void __p4_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int thread = p4_ht_config_thread(hwc->config); @@ -983,6 +983,16 @@ static void p4_pmu_enable_event(struct perf_event *event) (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } +static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(X86_PMC_IDX_MAX)], p4_running); + +static void p4_pmu_enable_event(struct perf_event *event) +{ + int idx = event->hw.idx; + + __set_bit(idx, per_cpu(p4_running, smp_processor_id())); + __p4_pmu_enable_event(event); +} + static void p4_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -992,7 +1002,7 @@ static void p4_pmu_enable_all(int added) struct perf_event *event = cpuc->events[idx]; if (!test_bit(idx, cpuc->active_mask)) continue; - p4_pmu_enable_event(event); + __p4_pmu_enable_event(event); } } @@ -1012,7 +1022,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!test_bit(idx, cpuc->active_mask)) { /* catch in-flight IRQs */ - if (__test_and_clear_bit(idx, cpuc->running)) + if (__test_and_clear_bit(idx, per_cpu(p4_running, smp_processor_id()))) handled++; continue; } @@ -1313,7 +1323,7 @@ static __initconst const struct x86_pmu p4_pmu = { .get_event_constraints = x86_get_event_constraints, /* * IF HT disabled we may need to use all - * ARCH_P4_MAX_CCCR counters simulaneously + * ARCH_P4_MAX_CCCR counters simultaneously * though leave it restricted at moment assuming * HT is on */ diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index e94af4a54d0d..915847655c06 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -362,7 +362,7 @@ static bool pt_event_valid(struct perf_event *event) /* * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config - * clears the assomption that BranchEn must always be enabled, + * clears the assumption that BranchEn must always be enabled, * as was the case with the first implementation of PT. * If this bit is not set, the legacy behavior is preserved * for compatibility with the older userspace. diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 33c8180d5a87..df7b07d7fdcb 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -4,8 +4,13 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "uncore.h" +#include "uncore_discovery.h" -static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static bool uncore_no_discover; +module_param(uncore_no_discover, bool, 0); +MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism " + "(default: enable the discovery mechanism)."); +struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; @@ -48,6 +53,18 @@ int uncore_pcibus_to_dieid(struct pci_bus *bus) return die_id; } +int uncore_die_to_segment(int die) +{ + struct pci_bus *bus = NULL; + + /* Find first pci bus which attributes to specified die. */ + while ((bus = pci_find_next_bus(bus)) && + (die != uncore_pcibus_to_dieid(bus))) + ; + + return bus ? pci_domain_nr(bus) : -EINVAL; +} + static void uncore_free_pcibus_map(void) { struct pci2phy_map *map, *tmp; @@ -829,6 +846,34 @@ static const struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; +static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu) +{ + struct intel_uncore_type *type = pmu->type; + + /* + * No uncore block name in discovery table. + * Use uncore_type_&typeid_&boxid as name. + */ + if (!type->name) { + if (type->num_boxes == 1) + sprintf(pmu->name, "uncore_type_%u", type->type_id); + else { + sprintf(pmu->name, "uncore_type_%u_%d", + type->type_id, type->box_ids[pmu->pmu_idx]); + } + return; + } + + if (type->num_boxes == 1) { + if (strlen(type->name) > 0) + sprintf(pmu->name, "uncore_%s", type->name); + else + sprintf(pmu->name, "uncore"); + } else + sprintf(pmu->name, "uncore_%s_%d", type->name, pmu->pmu_idx); + +} + static int uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -855,15 +900,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) pmu->pmu.attr_update = pmu->type->attr_update; } - if (pmu->type->num_boxes == 1) { - if (strlen(pmu->type->name) > 0) - sprintf(pmu->name, "uncore_%s", pmu->type->name); - else - sprintf(pmu->name, "uncore"); - } else { - sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, - pmu->pmu_idx); - } + uncore_get_pmu_name(pmu); ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (!ret) @@ -904,6 +941,10 @@ static void uncore_type_exit(struct intel_uncore_type *type) kfree(type->pmus); type->pmus = NULL; } + if (type->box_ids) { + kfree(type->box_ids); + type->box_ids = NULL; + } kfree(type->events_group); type->events_group = NULL; } @@ -1003,10 +1044,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die) return 0; } +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) && + pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) && + pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl)) + return &type->pmus[i]; + } + } + } + + return NULL; +} + /* * Find the PMU of a PCI device. * @pdev: The PCI device. * @ids: The ID table of the available PCI devices with a PMU. + * If NULL, search the whole uncore_pci_uncores. */ static struct intel_uncore_pmu * uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) @@ -1016,6 +1084,9 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) kernel_ulong_t data; unsigned int devfn; + if (!ids) + return uncore_pci_find_dev_pmu_from_types(pdev); + while (ids && ids->vendor) { if ((ids->vendor == pdev->vendor) && (ids->device == pdev->device)) { @@ -1174,7 +1245,8 @@ static void uncore_pci_remove(struct pci_dev *pdev) } static int uncore_bus_notify(struct notifier_block *nb, - unsigned long action, void *data) + unsigned long action, void *data, + const struct pci_device_id *ids) { struct device *dev = data; struct pci_dev *pdev = to_pci_dev(dev); @@ -1185,7 +1257,7 @@ static int uncore_bus_notify(struct notifier_block *nb, if (action != BUS_NOTIFY_DEL_DEVICE) return NOTIFY_DONE; - pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table); + pmu = uncore_pci_find_dev_pmu(pdev, ids); if (!pmu) return NOTIFY_DONE; @@ -1197,8 +1269,15 @@ static int uncore_bus_notify(struct notifier_block *nb, return NOTIFY_OK; } -static struct notifier_block uncore_notifier = { - .notifier_call = uncore_bus_notify, +static int uncore_pci_sub_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, + uncore_pci_sub_driver->id_table); +} + +static struct notifier_block uncore_pci_sub_notifier = { + .notifier_call = uncore_pci_sub_bus_notify, }; static void uncore_pci_sub_driver_init(void) @@ -1239,13 +1318,55 @@ static void uncore_pci_sub_driver_init(void) ids++; } - if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier)) + if (notify && bus_register_notifier(&pci_bus_type, &uncore_pci_sub_notifier)) notify = false; if (!notify) uncore_pci_sub_driver = NULL; } +static int uncore_pci_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, NULL); +} + +static struct notifier_block uncore_pci_notifier = { + .notifier_call = uncore_pci_bus_notify, +}; + + +static void uncore_pci_pmus_register(void) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pdev; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl), + UNCORE_DISCOVERY_PCI_BUS(box_ctl), + UNCORE_DISCOVERY_PCI_DEVFN(box_ctl)); + if (!pdev) + continue; + pmu = &type->pmus[i]; + + uncore_pci_pmu_register(pdev, type, pmu, die); + } + } + } + + bus_register_notifier(&pci_bus_type, &uncore_pci_notifier); +} + static int __init uncore_pci_init(void) { size_t size; @@ -1262,12 +1383,15 @@ static int __init uncore_pci_init(void) if (ret) goto errtype; - uncore_pci_driver->probe = uncore_pci_probe; - uncore_pci_driver->remove = uncore_pci_remove; + if (uncore_pci_driver) { + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; - ret = pci_register_driver(uncore_pci_driver); - if (ret) - goto errtype; + ret = pci_register_driver(uncore_pci_driver); + if (ret) + goto errtype; + } else + uncore_pci_pmus_register(); if (uncore_pci_sub_driver) uncore_pci_sub_driver_init(); @@ -1290,8 +1414,11 @@ static void uncore_pci_exit(void) if (pcidrv_registered) { pcidrv_registered = false; if (uncore_pci_sub_driver) - bus_unregister_notifier(&pci_bus_type, &uncore_notifier); - pci_unregister_driver(uncore_pci_driver); + bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier); + if (uncore_pci_driver) + pci_unregister_driver(uncore_pci_driver); + else + bus_unregister_notifier(&pci_bus_type, &uncore_pci_notifier); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev); uncore_free_pcibus_map(); @@ -1625,6 +1752,11 @@ static const struct intel_uncore_init_fun rkl_uncore_init __initconst = { .pci_init = skl_uncore_pci_init, }; +static const struct intel_uncore_init_fun adl_uncore_init __initconst = { + .cpu_init = adl_uncore_cpu_init, + .mmio_init = tgl_uncore_mmio_init, +}; + static const struct intel_uncore_init_fun icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, @@ -1637,6 +1769,12 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = { .mmio_init = snr_uncore_mmio_init, }; +static const struct intel_uncore_init_fun generic_uncore_init __initconst = { + .cpu_init = intel_uncore_generic_uncore_cpu_init, + .pci_init = intel_uncore_generic_uncore_pci_init, + .mmio_init = intel_uncore_generic_uncore_mmio_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init), @@ -1673,6 +1811,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, }; @@ -1684,17 +1824,21 @@ static int __init intel_uncore_init(void) struct intel_uncore_init_fun *uncore_init; int pret = 0, cret = 0, mret = 0, ret; - id = x86_match_cpu(intel_uncore_match); - if (!id) - return -ENODEV; - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; __uncore_max_dies = topology_max_packages() * topology_max_die_per_package(); - uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + id = x86_match_cpu(intel_uncore_match); + if (!id) { + if (!uncore_no_discover && intel_uncore_has_discovery_tables()) + uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init; + else + return -ENODEV; + } else + uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_init->pci_init) { pret = uncore_init->pci_init(); if (!pret) @@ -1711,8 +1855,10 @@ static int __init intel_uncore_init(void) mret = uncore_mmio_init(); } - if (cret && pret && mret) - return -ENODEV; + if (cret && pret && mret) { + ret = -ENODEV; + goto free_discovery; + } /* Install hotplug callbacks to setup the targets for each package */ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, @@ -1727,6 +1873,8 @@ err: uncore_types_exit(uncore_msr_uncores); uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); +free_discovery: + intel_uncore_clear_discovery_tables(); return ret; } module_init(intel_uncore_init); @@ -1737,5 +1885,6 @@ static void __exit intel_uncore_exit(void) uncore_types_exit(uncore_msr_uncores); uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); + intel_uncore_clear_discovery_tables(); } module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index a3c6e1643ad2..291791002997 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -42,6 +42,7 @@ struct intel_uncore_pmu; struct intel_uncore_box; struct uncore_event_desc; struct freerunning_counters; +struct intel_uncore_topology; struct intel_uncore_type { const char *name; @@ -50,6 +51,7 @@ struct intel_uncore_type { int perf_ctr_bits; int fixed_ctr_bits; int num_freerunning_types; + int type_id; unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; @@ -57,6 +59,7 @@ struct intel_uncore_type { unsigned fixed_ctr; unsigned fixed_ctl; unsigned box_ctl; + u64 *box_ctls; /* Unit ctrl addr of the first box of each die */ union { unsigned msr_offset; unsigned mmio_offset; @@ -65,7 +68,12 @@ struct intel_uncore_type { unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; - unsigned *msr_offsets; + union { + unsigned *msr_offsets; + unsigned *pci_offsets; + unsigned *mmio_offsets; + }; + unsigned *box_ids; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -80,7 +88,7 @@ struct intel_uncore_type { * to identify which platform component each PMON block of that type is * supposed to monitor. */ - u64 *topology; + struct intel_uncore_topology *topology; /* * Optional callbacks for managing mapping of Uncore units to PMONs */ @@ -169,6 +177,11 @@ struct freerunning_counters { unsigned *box_offsets; }; +struct intel_uncore_topology { + u64 configuration; + int segment; +}; + struct pci2phy_map { struct list_head list; int segment; @@ -177,6 +190,7 @@ struct pci2phy_map { struct pci2phy_map *__find_pci2phy_map(int segment); int uncore_pcibus_to_dieid(struct pci_bus *bus); +int uncore_die_to_segment(int die); ssize_t uncore_event_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -547,6 +561,7 @@ uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); +extern struct intel_uncore_type *empty_uncore[]; extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct intel_uncore_type **uncore_mmio_uncores; @@ -567,6 +582,7 @@ void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); +void adl_uncore_cpu_init(void); void tgl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c new file mode 100644 index 000000000000..aba9bff95413 --- /dev/null +++ b/arch/x86/events/intel/uncore_discovery.c @@ -0,0 +1,622 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Support Intel uncore PerfMon discovery mechanism. + * Copyright(c) 2021 Intel Corporation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "uncore.h" +#include "uncore_discovery.h" + +static struct rb_root discovery_tables = RB_ROOT; +static int num_discovered_types[UNCORE_ACCESS_MAX]; + +static bool has_generic_discovery_table(void) +{ + struct pci_dev *dev; + int dvsec; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, UNCORE_DISCOVERY_TABLE_DEVICE, NULL); + if (!dev) + return false; + + /* A discovery table device has the unique capability ID. */ + dvsec = pci_find_next_ext_capability(dev, 0, UNCORE_EXT_CAP_ID_DISCOVERY); + pci_dev_put(dev); + if (dvsec) + return true; + + return false; +} + +static int logical_die_id; + +static int get_device_die_id(struct pci_dev *dev) +{ + int cpu, node = pcibus_to_node(dev->bus); + + /* + * If the NUMA info is not available, assume that the logical die id is + * continuous in the order in which the discovery table devices are + * detected. + */ + if (node < 0) + return logical_die_id++; + + for_each_cpu(cpu, cpumask_of_node(node)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) + return c->logical_die_id; + } + + /* + * All CPUs of a node may be offlined. For this case, + * the PCI and MMIO type of uncore blocks which are + * enumerated by the device will be unavailable. + */ + return -1; +} + +#define __node_2_type(cur) \ + rb_entry((cur), struct intel_uncore_discovery_type, node) + +static inline int __type_cmp(const void *key, const struct rb_node *b) +{ + struct intel_uncore_discovery_type *type_b = __node_2_type(b); + const u16 *type_id = key; + + if (type_b->type > *type_id) + return -1; + else if (type_b->type < *type_id) + return 1; + + return 0; +} + +static inline struct intel_uncore_discovery_type * +search_uncore_discovery_type(u16 type_id) +{ + struct rb_node *node = rb_find(&type_id, &discovery_tables, __type_cmp); + + return (node) ? __node_2_type(node) : NULL; +} + +static inline bool __type_less(struct rb_node *a, const struct rb_node *b) +{ + return (__node_2_type(a)->type < __node_2_type(b)->type); +} + +static struct intel_uncore_discovery_type * +add_uncore_discovery_type(struct uncore_unit_discovery *unit) +{ + struct intel_uncore_discovery_type *type; + + if (unit->access_type >= UNCORE_ACCESS_MAX) { + pr_warn("Unsupported access type %d\n", unit->access_type); + return NULL; + } + + type = kzalloc(sizeof(struct intel_uncore_discovery_type), GFP_KERNEL); + if (!type) + return NULL; + + type->box_ctrl_die = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL); + if (!type->box_ctrl_die) + goto free_type; + + type->access_type = unit->access_type; + num_discovered_types[type->access_type]++; + type->type = unit->box_type; + + rb_add(&type->node, &discovery_tables, __type_less); + + return type; + +free_type: + kfree(type); + + return NULL; + +} + +static struct intel_uncore_discovery_type * +get_uncore_discovery_type(struct uncore_unit_discovery *unit) +{ + struct intel_uncore_discovery_type *type; + + type = search_uncore_discovery_type(unit->box_type); + if (type) + return type; + + return add_uncore_discovery_type(unit); +} + +static void +uncore_insert_box_info(struct uncore_unit_discovery *unit, + int die, bool parsed) +{ + struct intel_uncore_discovery_type *type; + unsigned int *box_offset, *ids; + int i; + + if (WARN_ON_ONCE(!unit->ctl || !unit->ctl_offset || !unit->ctr_offset)) + return; + + if (parsed) { + type = search_uncore_discovery_type(unit->box_type); + if (WARN_ON_ONCE(!type)) + return; + /* Store the first box of each die */ + if (!type->box_ctrl_die[die]) + type->box_ctrl_die[die] = unit->ctl; + return; + } + + type = get_uncore_discovery_type(unit); + if (!type) + return; + + box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + if (!box_offset) + return; + + ids = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + if (!ids) + goto free_box_offset; + + /* Store generic information for the first box */ + if (!type->num_boxes) { + type->box_ctrl = unit->ctl; + type->box_ctrl_die[die] = unit->ctl; + type->num_counters = unit->num_regs; + type->counter_width = unit->bit_width; + type->ctl_offset = unit->ctl_offset; + type->ctr_offset = unit->ctr_offset; + *ids = unit->box_id; + goto end; + } + + for (i = 0; i < type->num_boxes; i++) { + ids[i] = type->ids[i]; + box_offset[i] = type->box_offset[i]; + + if (WARN_ON_ONCE(unit->box_id == ids[i])) + goto free_ids; + } + ids[i] = unit->box_id; + box_offset[i] = unit->ctl - type->box_ctrl; + kfree(type->ids); + kfree(type->box_offset); +end: + type->ids = ids; + type->box_offset = box_offset; + type->num_boxes++; + return; + +free_ids: + kfree(ids); + +free_box_offset: + kfree(box_offset); + +} + +static int parse_discovery_table(struct pci_dev *dev, int die, + u32 bar_offset, bool *parsed) +{ + struct uncore_global_discovery global; + struct uncore_unit_discovery unit; + void __iomem *io_addr; + resource_size_t addr; + unsigned long size; + u32 val; + int i; + + pci_read_config_dword(dev, bar_offset, &val); + + if (val & UNCORE_DISCOVERY_MASK) + return -EINVAL; + + addr = (resource_size_t)(val & ~UNCORE_DISCOVERY_MASK); + size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE; + io_addr = ioremap(addr, size); + if (!io_addr) + return -ENOMEM; + + /* Read Global Discovery State */ + memcpy_fromio(&global, io_addr, sizeof(struct uncore_global_discovery)); + if (uncore_discovery_invalid_unit(global)) { + pr_info("Invalid Global Discovery State: 0x%llx 0x%llx 0x%llx\n", + global.table1, global.ctl, global.table3); + iounmap(io_addr); + return -EINVAL; + } + iounmap(io_addr); + + size = (1 + global.max_units) * global.stride * 8; + io_addr = ioremap(addr, size); + if (!io_addr) + return -ENOMEM; + + /* Parsing Unit Discovery State */ + for (i = 0; i < global.max_units; i++) { + memcpy_fromio(&unit, io_addr + (i + 1) * (global.stride * 8), + sizeof(struct uncore_unit_discovery)); + + if (uncore_discovery_invalid_unit(unit)) + continue; + + if (unit.access_type >= UNCORE_ACCESS_MAX) + continue; + + uncore_insert_box_info(&unit, die, *parsed); + } + + *parsed = true; + iounmap(io_addr); + return 0; +} + +bool intel_uncore_has_discovery_tables(void) +{ + u32 device, val, entry_id, bar_offset; + int die, dvsec = 0, ret = true; + struct pci_dev *dev = NULL; + bool parsed = false; + + if (has_generic_discovery_table()) + device = UNCORE_DISCOVERY_TABLE_DEVICE; + else + device = PCI_ANY_ID; + + /* + * Start a new search and iterates through the list of + * the discovery table devices. + */ + while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) { + while ((dvsec = pci_find_next_ext_capability(dev, dvsec, UNCORE_EXT_CAP_ID_DISCOVERY))) { + pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC_OFFSET, &val); + entry_id = val & UNCORE_DISCOVERY_DVSEC_ID_MASK; + if (entry_id != UNCORE_DISCOVERY_DVSEC_ID_PMON) + continue; + + pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC2_OFFSET, &val); + + if (val & ~UNCORE_DISCOVERY_DVSEC2_BIR_MASK) { + ret = false; + goto err; + } + bar_offset = UNCORE_DISCOVERY_BIR_BASE + + (val & UNCORE_DISCOVERY_DVSEC2_BIR_MASK) * UNCORE_DISCOVERY_BIR_STEP; + + die = get_device_die_id(dev); + if (die < 0) + continue; + + parse_discovery_table(dev, die, bar_offset, &parsed); + } + } + + /* None of the discovery tables are available */ + if (!parsed) + ret = false; +err: + pci_dev_put(dev); + + return ret; +} + +void intel_uncore_clear_discovery_tables(void) +{ + struct intel_uncore_discovery_type *type, *next; + + rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) { + kfree(type->box_ctrl_die); + kfree(type); + } +} + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh, thresh, "config:24-31"); + +static struct attribute *generic_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh.attr, + NULL, +}; + +static const struct attribute_group generic_uncore_format_group = { + .name = "format", + .attrs = generic_uncore_formats_attr, +}; + +static void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_INT); +} + +static void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ); +} + +static void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), 0); +} + +static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, 0); +} + +static struct intel_uncore_ops generic_uncore_msr_ops = { + .init_box = intel_generic_uncore_msr_init_box, + .disable_box = intel_generic_uncore_msr_disable_box, + .enable_box = intel_generic_uncore_msr_enable_box, + .disable_event = intel_generic_uncore_msr_disable_event, + .enable_event = intel_generic_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT); +} + +static void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ); +} + +static void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, 0); +} + +static void intel_generic_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, 0); +} + +static u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops generic_uncore_pci_ops = { + .init_box = intel_generic_uncore_pci_init_box, + .disable_box = intel_generic_uncore_pci_disable_box, + .enable_box = intel_generic_uncore_pci_enable_box, + .disable_event = intel_generic_uncore_pci_disable_event, + .enable_event = intel_generic_uncore_pci_enable_event, + .read_counter = intel_generic_uncore_pci_read_counter, +}; + +#define UNCORE_GENERIC_MMIO_SIZE 0x4000 + +static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) +{ + struct intel_uncore_type *type = box->pmu->type; + + if (!type->box_ctls || !type->box_ctls[box->dieid] || !type->mmio_offsets) + return 0; + + return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx]; +} + +static void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + unsigned int box_ctl = generic_uncore_mmio_box_ctl(box); + struct intel_uncore_type *type = box->pmu->type; + resource_size_t addr; + + if (!box_ctl) { + pr_warn("Uncore type %d box %d: Invalid box control address.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx]); + return; + } + + addr = box_ctl; + box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE); + if (!box->io_addr) { + pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx], + (unsigned long long)addr); + return; + } + + writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr); +} + +static void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr); +} + +static void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(0, box->io_addr); +} + +static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(hwc->config, box->io_addr + hwc->config_base); +} + +static void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(0, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops generic_uncore_mmio_ops = { + .init_box = intel_generic_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = intel_generic_uncore_mmio_disable_box, + .enable_box = intel_generic_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = intel_generic_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +static bool uncore_update_uncore_type(enum uncore_access_type type_id, + struct intel_uncore_type *uncore, + struct intel_uncore_discovery_type *type) +{ + uncore->type_id = type->type; + uncore->num_boxes = type->num_boxes; + uncore->num_counters = type->num_counters; + uncore->perf_ctr_bits = type->counter_width; + uncore->box_ids = type->ids; + + switch (type_id) { + case UNCORE_ACCESS_MSR: + uncore->ops = &generic_uncore_msr_ops; + uncore->perf_ctr = (unsigned int)type->box_ctrl + type->ctr_offset; + uncore->event_ctl = (unsigned int)type->box_ctrl + type->ctl_offset; + uncore->box_ctl = (unsigned int)type->box_ctrl; + uncore->msr_offsets = type->box_offset; + break; + case UNCORE_ACCESS_PCI: + uncore->ops = &generic_uncore_pci_ops; + uncore->perf_ctr = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctr_offset; + uncore->event_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctl_offset; + uncore->box_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl); + uncore->box_ctls = type->box_ctrl_die; + uncore->pci_offsets = type->box_offset; + break; + case UNCORE_ACCESS_MMIO: + uncore->ops = &generic_uncore_mmio_ops; + uncore->perf_ctr = (unsigned int)type->ctr_offset; + uncore->event_ctl = (unsigned int)type->ctl_offset; + uncore->box_ctl = (unsigned int)type->box_ctrl; + uncore->box_ctls = type->box_ctrl_die; + uncore->mmio_offsets = type->box_offset; + uncore->mmio_map_size = UNCORE_GENERIC_MMIO_SIZE; + break; + default: + return false; + } + + return true; +} + +static struct intel_uncore_type ** +intel_uncore_generic_init_uncores(enum uncore_access_type type_id) +{ + struct intel_uncore_discovery_type *type; + struct intel_uncore_type **uncores; + struct intel_uncore_type *uncore; + struct rb_node *node; + int i = 0; + + uncores = kcalloc(num_discovered_types[type_id] + 1, + sizeof(struct intel_uncore_type *), GFP_KERNEL); + if (!uncores) + return empty_uncore; + + for (node = rb_first(&discovery_tables); node; node = rb_next(node)) { + type = rb_entry(node, struct intel_uncore_discovery_type, node); + if (type->access_type != type_id) + continue; + + uncore = kzalloc(sizeof(struct intel_uncore_type), GFP_KERNEL); + if (!uncore) + break; + + uncore->event_mask = GENERIC_PMON_RAW_EVENT_MASK; + uncore->format_group = &generic_uncore_format_group; + + if (!uncore_update_uncore_type(type_id, uncore, type)) { + kfree(uncore); + continue; + } + uncores[i++] = uncore; + } + + return uncores; +} + +void intel_uncore_generic_uncore_cpu_init(void) +{ + uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR); +} + +int intel_uncore_generic_uncore_pci_init(void) +{ + uncore_pci_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI); + + return 0; +} + +void intel_uncore_generic_uncore_mmio_init(void) +{ + uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO); +} diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h new file mode 100644 index 000000000000..1d652939a01c --- /dev/null +++ b/arch/x86/events/intel/uncore_discovery.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* Generic device ID of a discovery table device */ +#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 +/* Capability ID for a discovery table device */ +#define UNCORE_EXT_CAP_ID_DISCOVERY 0x23 +/* First DVSEC offset */ +#define UNCORE_DISCOVERY_DVSEC_OFFSET 0x8 +/* Mask of the supported discovery entry type */ +#define UNCORE_DISCOVERY_DVSEC_ID_MASK 0xffff +/* PMON discovery entry type ID */ +#define UNCORE_DISCOVERY_DVSEC_ID_PMON 0x1 +/* Second DVSEC offset */ +#define UNCORE_DISCOVERY_DVSEC2_OFFSET 0xc +/* Mask of the discovery table BAR offset */ +#define UNCORE_DISCOVERY_DVSEC2_BIR_MASK 0x7 +/* Discovery table BAR base offset */ +#define UNCORE_DISCOVERY_BIR_BASE 0x10 +/* Discovery table BAR step */ +#define UNCORE_DISCOVERY_BIR_STEP 0x4 +/* Mask of the discovery table offset */ +#define UNCORE_DISCOVERY_MASK 0xf +/* Global discovery table size */ +#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20 + +#define UNCORE_DISCOVERY_PCI_DOMAIN(data) ((data >> 28) & 0x7) +#define UNCORE_DISCOVERY_PCI_BUS(data) ((data >> 20) & 0xff) +#define UNCORE_DISCOVERY_PCI_DEVFN(data) ((data >> 12) & 0xff) +#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff) + + +#define uncore_discovery_invalid_unit(unit) \ + (!unit.table1 || !unit.ctl || !unit.table3 || \ + unit.table1 == -1ULL || unit.ctl == -1ULL || \ + unit.table3 == -1ULL) + +#define GENERIC_PMON_CTL_EV_SEL_MASK 0x000000ff +#define GENERIC_PMON_CTL_UMASK_MASK 0x0000ff00 +#define GENERIC_PMON_CTL_EDGE_DET (1 << 18) +#define GENERIC_PMON_CTL_INVERT (1 << 23) +#define GENERIC_PMON_CTL_TRESH_MASK 0xff000000 +#define GENERIC_PMON_RAW_EVENT_MASK (GENERIC_PMON_CTL_EV_SEL_MASK | \ + GENERIC_PMON_CTL_UMASK_MASK | \ + GENERIC_PMON_CTL_EDGE_DET | \ + GENERIC_PMON_CTL_INVERT | \ + GENERIC_PMON_CTL_TRESH_MASK) + +#define GENERIC_PMON_BOX_CTL_FRZ (1 << 0) +#define GENERIC_PMON_BOX_CTL_RST_CTRL (1 << 8) +#define GENERIC_PMON_BOX_CTL_RST_CTRS (1 << 9) +#define GENERIC_PMON_BOX_CTL_INT (GENERIC_PMON_BOX_CTL_RST_CTRL | \ + GENERIC_PMON_BOX_CTL_RST_CTRS) + +enum uncore_access_type { + UNCORE_ACCESS_MSR = 0, + UNCORE_ACCESS_MMIO, + UNCORE_ACCESS_PCI, + + UNCORE_ACCESS_MAX, +}; + +struct uncore_global_discovery { + union { + u64 table1; + struct { + u64 type : 8, + stride : 8, + max_units : 10, + __reserved_1 : 36, + access_type : 2; + }; + }; + + u64 ctl; /* Global Control Address */ + + union { + u64 table3; + struct { + u64 status_offset : 8, + num_status : 16, + __reserved_2 : 40; + }; + }; +}; + +struct uncore_unit_discovery { + union { + u64 table1; + struct { + u64 num_regs : 8, + ctl_offset : 8, + bit_width : 8, + ctr_offset : 8, + status_offset : 8, + __reserved_1 : 22, + access_type : 2; + }; + }; + + u64 ctl; /* Unit Control Address */ + + union { + u64 table3; + struct { + u64 box_type : 16, + box_id : 16, + __reserved_2 : 32; + }; + }; +}; + +struct intel_uncore_discovery_type { + struct rb_node node; + enum uncore_access_type access_type; + u64 box_ctrl; /* Unit ctrl addr of the first box */ + u64 *box_ctrl_die; /* Unit ctrl addr of the first box of each die */ + u16 type; /* Type ID of the uncore block */ + u8 num_counters; + u8 counter_width; + u8 ctl_offset; /* Counter Control 0 offset */ + u8 ctr_offset; /* Counter 0 offset */ + u16 num_boxes; /* number of boxes for the uncore block */ + unsigned int *ids; /* Box IDs */ + unsigned int *box_offset; /* Box offset */ +}; + +bool intel_uncore_has_discovery_tables(void); +void intel_uncore_clear_discovery_tables(void); +void intel_uncore_generic_uncore_cpu_init(void); +int intel_uncore_generic_uncore_pci_init(void); +void intel_uncore_generic_uncore_mmio_init(void); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 51271288499e..0f63706cdadf 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -62,6 +62,8 @@ #define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36 #define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43 #define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53 +#define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660 +#define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -131,12 +133,33 @@ #define ICL_UNC_ARB_PER_CTR 0x3b1 #define ICL_UNC_ARB_PERFEVTSEL 0x3b3 +/* ADL uncore global control */ +#define ADL_UNC_PERF_GLOBAL_CTL 0x2ff0 +#define ADL_UNC_FIXED_CTR_CTRL 0x2fde +#define ADL_UNC_FIXED_CTR 0x2fdf + +/* ADL Cbo register */ +#define ADL_UNC_CBO_0_PER_CTR0 0x2002 +#define ADL_UNC_CBO_0_PERFEVTSEL0 0x2000 +#define ADL_UNC_CTL_THRESHOLD 0x3f000000 +#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + ADL_UNC_CTL_THRESHOLD) + +/* ADL ARB register */ +#define ADL_UNC_ARB_PER_CTR0 0x2FD2 +#define ADL_UNC_ARB_PERFEVTSEL0 0x2FD0 +#define ADL_UNC_ARB_MSR_OFFSET 0x8 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29"); /* Sandy Bridge uncore support */ static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -422,6 +445,106 @@ void tgl_uncore_cpu_init(void) skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box; } +static void adl_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); +} + +static void adl_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); +} + +static void adl_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0); +} + +static void adl_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0); +} + +static struct intel_uncore_ops adl_uncore_msr_ops = { + .init_box = adl_uncore_msr_init_box, + .enable_box = adl_uncore_msr_enable_box, + .disable_box = adl_uncore_msr_disable_box, + .exit_box = adl_uncore_msr_exit_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct attribute *adl_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_threshold.attr, + NULL, +}; + +static const struct attribute_group adl_uncore_format_group = { + .name = "format", + .attrs = adl_uncore_formats_attr, +}; + +static struct intel_uncore_type adl_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .perf_ctr_bits = 44, + .perf_ctr = ADL_UNC_CBO_0_PER_CTR0, + .event_ctl = ADL_UNC_CBO_0_PERFEVTSEL0, + .event_mask = ADL_UNC_RAW_EVENT_MASK, + .msr_offset = ICL_UNC_CBO_MSR_OFFSET, + .ops = &adl_uncore_msr_ops, + .format_group = &adl_uncore_format_group, +}; + +static struct intel_uncore_type adl_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 2, + .perf_ctr_bits = 44, + .perf_ctr = ADL_UNC_ARB_PER_CTR0, + .event_ctl = ADL_UNC_ARB_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = ADL_UNC_ARB_MSR_OFFSET, + .constraints = snb_uncore_arb_constraints, + .ops = &adl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type adl_uncore_clockbox = { + .name = "clock", + .num_counters = 1, + .num_boxes = 1, + .fixed_ctr_bits = 48, + .fixed_ctr = ADL_UNC_FIXED_CTR, + .fixed_ctl = ADL_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_CTL_EV_SEL_MASK, + .format_group = &icl_uncore_clock_format_group, + .ops = &adl_uncore_msr_ops, + .event_descs = icl_uncore_events, +}; + +static struct intel_uncore_type *adl_msr_uncores[] = { + &adl_uncore_cbox, + &adl_uncore_arb, + &adl_uncore_clockbox, + NULL, +}; + +void adl_uncore_cpu_init(void) +{ + adl_uncore_cbox.num_boxes = icl_get_cbox_num(); + uncore_msr_uncores = adl_msr_uncores; +} + enum { SNB_PCI_UNCORE_IMC, }; @@ -1203,6 +1326,14 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_H_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ } }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b79951d0707c..63f097289a84 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -280,17 +280,17 @@ * | [63] | 00h | VALID - When set, indicates the CPU bus * numbers have been initialized. (RO) * |[62:48]| --- | Reserved - * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * |[47:40]| 00h | BUS_NUM_5 - Return the bus number BIOS assigned * CPUBUSNO(5). (RO) - * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * |[39:32]| 00h | BUS_NUM_4 - Return the bus number BIOS assigned * CPUBUSNO(4). (RO) - * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * |[31:24]| 00h | BUS_NUM_3 - Return the bus number BIOS assigned * CPUBUSNO(3). (RO) - * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * |[23:16]| 00h | BUS_NUM_2 - Return the bus number BIOS assigned * CPUBUSNO(2). (RO) - * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * |[15:8] | 00h | BUS_NUM_1 - Return the bus number BIOS assigned * CPUBUSNO(1). (RO) - * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * | [7:0] | 00h | BUS_NUM_0 - Return the bus number BIOS assigned * CPUBUSNO(0). (RO) */ #define SKX_MSR_CPU_BUS_NUMBER 0x300 @@ -1159,7 +1159,6 @@ enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, BDX_PCI_QPI_PORT2_FILTER, - HSWEP_PCI_PCU_3, }; static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -2857,22 +2856,33 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { NULL, }; -void hswep_uncore_cpu_init(void) +#define HSWEP_PCU_DID 0x2fc0 +#define HSWEP_PCU_CAPID4_OFFET 0x94 +#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3) + +static bool hswep_has_limit_sbox(unsigned int device) { - int pkg = boot_cpu_data.logical_proc_id; + struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL); + u32 capid4; + + if (!dev) + return false; + + pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4); + if (!hswep_get_chop(capid4)) + return true; + return false; +} + +void hswep_uncore_cpu_init(void) +{ if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - u32 capid4; - - pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3], - 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - hswep_uncore_sbox.num_boxes = 2; - } + if (hswep_has_limit_sbox(HSWEP_PCU_DID)) + hswep_uncore_sbox.num_boxes = 2; uncore_msr_uncores = hswep_msr_uncores; } @@ -3135,11 +3145,6 @@ static const struct pci_device_id hswep_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, SNBEP_PCI_QPI_PORT1_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; @@ -3231,27 +3236,18 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { EVENT_CONSTRAINT_END }; +#define BDX_PCU_DID 0x6fc0 + void bdx_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id); - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; - /* BDX-DE doesn't have SBOX */ - if (boot_cpu_data.x86_model == 86) { - uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; /* Detect systems with no SBOXes */ - } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - struct pci_dev *pdev; - u32 capid4; - - pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]; - pci_read_config_dword(pdev, 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; - } + if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID)) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; + hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; } @@ -3472,11 +3468,6 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, BDX_PCI_QPI_PORT2_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; @@ -3684,7 +3675,8 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) { - return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); + return pmu->type->topology[die].configuration >> + (pmu->pmu_idx * BUS_NUM_STRIDE); } static umode_t @@ -3697,19 +3689,14 @@ skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) } static ssize_t skx_iio_mapping_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { - struct pci_bus *bus = pci_find_next_bus(NULL); - struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); long die = (long)ea->var; - /* - * Current implementation is for single segment configuration hence it's - * safe to take the segment value from the first available root bus. - */ - return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), - skx_iio_stack(uncore_pmu, die)); + return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment, + skx_iio_stack(pmu, die)); } static int skx_msr_cpu_bus_read(int cpu, u64 *topology) @@ -3746,34 +3733,32 @@ static int die_to_cpu(int die) static int skx_iio_get_topology(struct intel_uncore_type *type) { - int i, ret; - struct pci_bus *bus = NULL; + int die, ret = -EPERM; - /* - * Verified single-segment environments only; disabled for multiple - * segment topologies for now except VMD domains. - * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. - */ - while ((bus = pci_find_next_bus(bus)) - && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff)) - ; - if (bus) - return -EPERM; - - type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL); + type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology), + GFP_KERNEL); if (!type->topology) return -ENOMEM; - for (i = 0; i < uncore_max_dies(); i++) { - ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]); - if (ret) { - kfree(type->topology); - type->topology = NULL; - return ret; - } + for (die = 0; die < uncore_max_dies(); die++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(die), + &type->topology[die].configuration); + if (ret) + break; + + ret = uncore_die_to_segment(die); + if (ret < 0) + break; + + type->topology[die].segment = ret; } - return 0; + if (ret < 0) { + kfree(type->topology); + type->topology = NULL; + } + + return ret; } static struct attribute_group skx_iio_mapping_group = { @@ -3794,7 +3779,7 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) struct dev_ext_attribute *eas = NULL; ret = skx_iio_get_topology(type); - if (ret) + if (ret < 0) goto clear_attr_update; ret = -ENOMEM; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 680404c58cb1..c853b28efa33 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -100,6 +100,8 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: case INTEL_FAM6_ROCKETLAKE: + case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 53b2b5fc23bc..27fa85e7d4fd 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -15,6 +15,7 @@ #include <linux/perf_event.h> #include <asm/intel_ds.h> +#include <asm/cpu.h> /* To enable MSR tracing please use the generic trace points. */ @@ -228,7 +229,6 @@ struct cpu_hw_events { */ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; int n_events; /* the # of events in the below arrays */ @@ -327,6 +327,8 @@ struct cpu_hw_events { int n_pair; /* Large increment events */ void *kfree_on_online[X86_PERF_KFREE_MAX]; + + struct pmu *pmu; }; #define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \ @@ -630,6 +632,71 @@ enum { x86_lbr_exclusive_max, }; +struct x86_hybrid_pmu { + struct pmu pmu; + const char *name; + u8 cpu_type; + cpumask_t supported_cpus; + union perf_capabilities intel_cap; + u64 intel_ctrl; + int max_pebs_events; + int num_counters; + int num_counters_fixed; + struct event_constraint unconstrained; + + u64 hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + u64 hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + struct event_constraint *event_constraints; + struct event_constraint *pebs_constraints; + struct extra_reg *extra_regs; +}; + +static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu) +{ + return container_of(pmu, struct x86_hybrid_pmu, pmu); +} + +extern struct static_key_false perf_is_hybrid; +#define is_hybrid() static_branch_unlikely(&perf_is_hybrid) + +#define hybrid(_pmu, _field) \ +(*({ \ + typeof(&x86_pmu._field) __Fp = &x86_pmu._field; \ + \ + if (is_hybrid() && (_pmu)) \ + __Fp = &hybrid_pmu(_pmu)->_field; \ + \ + __Fp; \ +})) + +#define hybrid_var(_pmu, _var) \ +(*({ \ + typeof(&_var) __Fp = &_var; \ + \ + if (is_hybrid() && (_pmu)) \ + __Fp = &hybrid_pmu(_pmu)->_var; \ + \ + __Fp; \ +})) + +enum hybrid_pmu_type { + hybrid_big = 0x40, + hybrid_small = 0x20, + + hybrid_big_small = hybrid_big | hybrid_small, +}; + +#define X86_HYBRID_PMU_ATOM_IDX 0 +#define X86_HYBRID_PMU_CORE_IDX 1 + +#define X86_HYBRID_NUM_PMUS 2 + /* * struct x86_pmu - generic x86 pmu */ @@ -816,6 +883,19 @@ struct x86_pmu { int (*check_period) (struct perf_event *event, u64 period); int (*aux_output_match) (struct perf_event *event); + + int (*filter_match)(struct perf_event *event); + /* + * Hybrid support + * + * Most PMU capabilities are the same among different hybrid PMUs. + * The global x86_pmu saves the architecture capabilities, which + * are available for all PMUs. The hybrid_pmu only includes the + * unique capabilities. + */ + int num_hybrid_pmus; + struct x86_hybrid_pmu *hybrid_pmu; + u8 (*get_hybrid_cpu_type) (void); }; struct x86_perf_task_context_opt { @@ -905,7 +985,23 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ .event_str_ht = ht, \ } -struct pmu *x86_get_pmu(void); +#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \ +static struct perf_pmu_events_hybrid_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\ + .id = 0, \ + .event_str = str, \ + .pmu_type = _pmu, \ +} + +#define FORMAT_HYBRID_PTR(_id) (&format_attr_hybrid_##_id.attr.attr) + +#define FORMAT_ATTR_HYBRID(_name, _pmu) \ +static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ + .attr = __ATTR_RO(_name), \ + .pmu_type = _pmu, \ +} + +struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) @@ -964,6 +1060,9 @@ static inline int x86_pmu_rdpmc_index(int index) return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; } +bool check_hw_exists(struct pmu *pmu, int num_counters, + int num_counters_fixed); + int x86_add_exclusive(unsigned int what); void x86_del_exclusive(unsigned int what); @@ -1027,6 +1126,11 @@ void x86_pmu_enable_event(struct perf_event *event); int x86_pmu_handle_irq(struct pt_regs *regs); +void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, + u64 intel_ctrl); + +void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu); + extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; @@ -1067,10 +1171,15 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); +ssize_t events_hybrid_sysfs_show(struct device *dev, + struct device_attribute *attr, + char *page); -static inline bool fixed_counter_disabled(int i) +static inline bool fixed_counter_disabled(int i, struct pmu *pmu) { - return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED)); + u64 intel_ctrl = hybrid(pmu, intel_ctrl); + + return !(intel_ctrl >> (i + INTEL_PMC_IDX_FIXED)); } #ifdef CONFIG_CPU_SUP_AMD @@ -1154,6 +1263,8 @@ extern struct event_constraint intel_glm_pebs_event_constraints[]; extern struct event_constraint intel_glp_pebs_event_constraints[]; +extern struct event_constraint intel_grt_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index f42a70496a24..84a1042c3b01 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -800,6 +800,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h), diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c index e68827e604ad..949d845c922b 100644 --- a/arch/x86/events/zhaoxin/core.c +++ b/arch/x86/events/zhaoxin/core.c @@ -494,7 +494,7 @@ static __init void zhaoxin_arch_events_quirk(void) { int bit; - /* disable event that reported as not presend by cpuid */ + /* disable event that reported as not present by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) { zx_pmon_event_map[zx_arch_events_map[bit].id] = 0; pr_warn("CPUID marked event: \'%s\' unavailable\n", diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 284e73661a18..90e682a92820 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -60,9 +60,11 @@ static u32 hv_apic_read(u32 reg) switch (reg) { case APIC_EOI: rdmsr(HV_X64_MSR_EOI, reg_val, hi); + (void)hi; return reg_val; case APIC_TASKPRI: rdmsr(HV_X64_MSR_TPR, reg_val, hi); + (void)hi; return reg_val; default: @@ -103,7 +105,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) struct hv_send_ipi_ex *ipi_arg; unsigned long flags; int nr_bank = 0; - int ret = 1; + u64 status = HV_STATUS_INVALID_PARAMETER; if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return false; @@ -128,19 +130,19 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) if (!nr_bank) ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; - ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, + status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, ipi_arg, NULL); ipi_mask_ex_done: local_irq_restore(flags); - return ((ret == 0) ? true : false); + return hv_result_success(status); } static bool __send_ipi_mask(const struct cpumask *mask, int vector) { int cur_cpu, vcpu; struct hv_send_ipi ipi_arg; - int ret = 1; + u64 status; trace_hyperv_send_ipi_mask(mask, vector); @@ -184,9 +186,9 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector) __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask); } - ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, + status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, ipi_arg.cpu_mask); - return ((ret == 0) ? true : false); + return hv_result_success(status); do_ex_hypercall: return __send_ipi_mask_ex(mask, vector); @@ -195,6 +197,7 @@ do_ex_hypercall: static bool __send_ipi_one(int cpu, int vector) { int vp = hv_cpu_number_to_vp_number(cpu); + u64 status; trace_hyperv_send_ipi_one(cpu, vector); @@ -207,7 +210,8 @@ static bool __send_ipi_one(int cpu, int vector) if (vp >= 64) return __send_ipi_mask_ex(cpumask_of(cpu), vector); - return !hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp)); + status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp)); + return hv_result_success(status); } static void hv_send_ipi(int cpu, int vector) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index b81047dec1da..bb0ae4b5c00f 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -54,28 +54,6 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); u32 hv_max_vp_index; EXPORT_SYMBOL_GPL(hv_max_vp_index); -void *hv_alloc_hyperv_page(void) -{ - BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE); - - return (void *)__get_free_page(GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page); - -void *hv_alloc_hyperv_zeroed_page(void) -{ - BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE); - - return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); -} -EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page); - -void hv_free_hyperv_page(unsigned long addr) -{ - free_page(addr); -} -EXPORT_SYMBOL_GPL(hv_free_hyperv_page); - static int hv_cpu_init(unsigned int cpu) { u64 msr_vp_index; @@ -97,7 +75,7 @@ static int hv_cpu_init(unsigned int cpu) *output_arg = page_address(pg + 1); } - hv_get_vp_index(msr_vp_index); + msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); hv_vp_index[smp_processor_id()] = msr_vp_index; @@ -162,7 +140,7 @@ EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); static inline bool hv_reenlightenment_available(void) { /* - * Check for required features and priviliges to make TSC frequency + * Check for required features and privileges to make TSC frequency * change notifications work. */ return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && @@ -292,7 +270,7 @@ static int hv_suspend(void) /* * Reset the hypercall page as it is going to be invalidated - * accross hibernation. Setting hv_hypercall_pg to NULL ensures + * across hibernation. Setting hv_hypercall_pg to NULL ensures * that any subsequent hypercall operation fails safely instead of * crashing due to an access of an invalid page. The hypercall page * pointer is restored on resume. @@ -349,7 +327,7 @@ static void __init hv_stimer_setup_percpu_clockev(void) * Ignore any errors in setting up stimer clockevents * as we can run with the LAPIC timer as a fallback. */ - (void)hv_stimer_alloc(); + (void)hv_stimer_alloc(false); /* * Still register the LAPIC timer, because the direct-mode STIMER is @@ -369,7 +347,7 @@ static void __init hv_get_partition_id(void) local_irq_save(flags); output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page); - if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { + if (!hv_result_success(status)) { /* No point in proceeding if this failed */ pr_err("Failed to get partition ID: %lld\n", status); BUG(); @@ -520,6 +498,8 @@ void __init hyperv_init(void) x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; #endif + /* Query the VMs extended capability once, so that it can be cached. */ + hv_query_ext_cap(0); return; remove_cpuhp_state: @@ -593,33 +573,6 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) } EXPORT_SYMBOL_GPL(hyperv_report_panic); -/** - * hyperv_report_panic_msg - report panic message to Hyper-V - * @pa: physical address of the panic page containing the message - * @size: size of the message in the page - */ -void hyperv_report_panic_msg(phys_addr_t pa, size_t size) -{ - /* - * P3 to contain the physical address of the panic page & P4 to - * contain the size of the panic data in that page. Rest of the - * registers are no-op when the NOTIFY_MSG flag is set. - */ - wrmsrl(HV_X64_MSR_CRASH_P0, 0); - wrmsrl(HV_X64_MSR_CRASH_P1, 0); - wrmsrl(HV_X64_MSR_CRASH_P2, 0); - wrmsrl(HV_X64_MSR_CRASH_P3, pa); - wrmsrl(HV_X64_MSR_CRASH_P4, size); - - /* - * Let Hyper-V know there is crash data available along with - * the panic message. - */ - wrmsrl(HV_X64_MSR_CRASH_CTL, - (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); -} -EXPORT_SYMBOL_GPL(hyperv_report_panic_msg); - bool hv_is_hyperv_initialized(void) { union hv_x64_msr_hypercall_contents hypercall_msr; @@ -650,7 +603,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported); enum hv_isolation_type hv_get_isolation_type(void) { - if (!(ms_hyperv.features_b & HV_ISOLATION)) + if (!(ms_hyperv.priv_high & HV_ISOLATION)) return HV_ISOLATION_TYPE_NONE; return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); } @@ -661,3 +614,50 @@ bool hv_is_isolation_supported(void) return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; } EXPORT_SYMBOL_GPL(hv_is_isolation_supported); + +/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */ +bool hv_query_ext_cap(u64 cap_query) +{ + /* + * The address of the 'hv_extended_cap' variable will be used as an + * output parameter to the hypercall below and so it should be + * compatible with 'virt_to_phys'. Which means, it's address should be + * directly mapped. Use 'static' to keep it compatible; stack variables + * can be virtually mapped, making them imcompatible with + * 'virt_to_phys'. + * Hypercall input/output addresses should also be 8-byte aligned. + */ + static u64 hv_extended_cap __aligned(8); + static bool hv_extended_cap_queried; + u64 status; + + /* + * Querying extended capabilities is an extended hypercall. Check if the + * partition supports extended hypercall, first. + */ + if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS)) + return false; + + /* Extended capabilities do not change at runtime. */ + if (hv_extended_cap_queried) + return hv_extended_cap & cap_query; + + status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, + &hv_extended_cap); + + /* + * The query extended capabilities hypercall should not fail under + * any normal circumstances. Avoid repeatedly making the hypercall, on + * error. + */ + hv_extended_cap_queried = true; + status &= HV_HYPERCALL_RESULT_MASK; + if (status != HV_STATUS_SUCCESS) { + pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n", + status); + return false; + } + + return hv_extended_cap & cap_query; +} +EXPORT_SYMBOL_GPL(hv_query_ext_cap); diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c index 60461e598239..68a0843d4750 100644 --- a/arch/x86/hyperv/hv_proc.c +++ b/arch/x86/hyperv/hv_proc.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> -#include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/clockchips.h> @@ -93,10 +92,9 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY, page_count, 0, input_page, NULL); local_irq_restore(flags); - - if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { + if (!hv_result_success(status)) { pr_err("Failed to deposit pages: %lld\n", status); - ret = status; + ret = hv_result(status); goto err_free_allocations; } @@ -122,7 +120,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) struct hv_add_logical_processor_out *output; u64 status; unsigned long flags; - int ret = 0; + int ret = HV_STATUS_SUCCESS; int pxm = node_to_pxm(node); /* @@ -148,13 +146,11 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) input, output); local_irq_restore(flags); - status &= HV_HYPERCALL_RESULT_MASK; - - if (status != HV_STATUS_INSUFFICIENT_MEMORY) { - if (status != HV_STATUS_SUCCESS) { + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_success(status)) { pr_err("%s: cpu %u apic ID %u, %lld\n", __func__, lp_index, apic_id, status); - ret = status; + ret = hv_result(status); } break; } @@ -169,7 +165,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) struct hv_create_vp *input; u64 status; unsigned long irq_flags; - int ret = 0; + int ret = HV_STATUS_SUCCESS; int pxm = node_to_pxm(node); /* Root VPs don't seem to need pages deposited */ @@ -200,13 +196,11 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); local_irq_restore(irq_flags); - status &= HV_HYPERCALL_RESULT_MASK; - - if (status != HV_STATUS_INSUFFICIENT_MEMORY) { - if (status != HV_STATUS_SUCCESS) { + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + if (!hv_result_success(status)) { pr_err("%s: vcpu %u, lp %u, %lld\n", __func__, vp_index, flags, status); - ret = status; + ret = hv_result(status); } break; } diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index f3270c1fc48c..91cfe698bde0 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -25,7 +25,6 @@ static void hv_qlock_kick(int cpu) static void hv_qlock_wait(u8 *byte, u8 val) { - unsigned long msr_val; unsigned long flags; if (in_nmi()) @@ -48,8 +47,13 @@ static void hv_qlock_wait(u8 *byte, u8 val) /* * Only issue the rdmsrl() when the lock state has not changed. */ - if (READ_ONCE(*byte) == val) + if (READ_ONCE(*byte) == val) { + unsigned long msr_val; + rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val); + + (void)msr_val; + } local_irq_restore(flags); } diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c index 4421a8d92e23..514fc64e23d5 100644 --- a/arch/x86/hyperv/irqdomain.c +++ b/arch/x86/hyperv/irqdomain.c @@ -63,10 +63,10 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level, local_irq_restore(flags); - if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) + if (!hv_result_success(status)) pr_err("%s: hypercall failed, status %lld\n", __func__, status); - return status & HV_HYPERCALL_RESULT_MASK; + return hv_result(status); } static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) @@ -88,7 +88,7 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); local_irq_restore(flags); - return status & HV_HYPERCALL_RESULT_MASK; + return hv_result(status); } #ifdef CONFIG_PCI_MSI diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 2c87350c1fb0..c0ba8874d9cb 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -58,7 +58,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, int cpu, vcpu, gva_n, max_gvas; struct hv_tlb_flush **flush_pcpu; struct hv_tlb_flush *flush; - u64 status = U64_MAX; + u64 status; unsigned long flags; trace_hyperv_mmu_flush_tlb_others(cpus, info); @@ -161,7 +161,7 @@ do_ex_hypercall: check_status: local_irq_restore(flags); - if (!(status & HV_HYPERCALL_RESULT_MASK)) + if (hv_result_success(status)) return; do_native: native_flush_tlb_others(cpus, info); @@ -176,7 +176,7 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, u64 status; if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) - return U64_MAX; + return HV_STATUS_INVALID_PARAMETER; flush_pcpu = (struct hv_tlb_flush_ex **) this_cpu_ptr(hyperv_pcpu_input_arg); @@ -201,7 +201,7 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus); if (nr_bank < 0) - return U64_MAX; + return HV_STATUS_INVALID_PARAMETER; /* * We can flush not more than max_gvas with one hypercall. Flush the diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c index dd0a843f766d..5d70968c8538 100644 --- a/arch/x86/hyperv/nested.c +++ b/arch/x86/hyperv/nested.c @@ -47,7 +47,7 @@ int hyperv_flush_guest_mapping(u64 as) flush, NULL); local_irq_restore(flags); - if (!(status & HV_HYPERCALL_RESULT_MASK)) + if (hv_result_success(status)) ret = 0; fault: @@ -92,7 +92,7 @@ int hyperv_flush_guest_mapping_range(u64 as, { struct hv_guest_mapping_flush_list **flush_pcpu; struct hv_guest_mapping_flush_list *flush; - u64 status = 0; + u64 status; unsigned long flags; int ret = -ENOTSUPP; int gpa_n = 0; @@ -125,10 +125,10 @@ int hyperv_flush_guest_mapping_range(u64 as, local_irq_restore(flags); - if (!(status & HV_HYPERCALL_RESULT_MASK)) + if (hv_result_success(status)) ret = 0; else - ret = status; + ret = hv_result(status); fault: trace_hyperv_nested_flush_guest_mapping_range(as, ret); return ret; diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 62da760d6d5a..cd7b14322035 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -9,7 +9,7 @@ * Functions to keep the agpgart mappings coherent with the MMU. The * GART gives the CPU a physical alias of pages in memory. The alias * region is mapped uncacheable. Make sure there are no conflicting - * mappings with different cachability attributes for the same + * mappings with different cacheability attributes for the same * page. This avoids data corruption on some CPUs. */ diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h deleted file mode 100644 index 464034db299f..000000000000 --- a/arch/x86/include/asm/alternative-asm.h +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_ALTERNATIVE_ASM_H -#define _ASM_X86_ALTERNATIVE_ASM_H - -#ifdef __ASSEMBLY__ - -#include <asm/asm.h> - -#ifdef CONFIG_SMP - .macro LOCK_PREFIX -672: lock - .pushsection .smp_locks,"a" - .balign 4 - .long 672b - . - .popsection - .endm -#else - .macro LOCK_PREFIX - .endm -#endif - -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -.macro ANNOTATE_IGNORE_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.ignore_alts - .long .Lannotate_\@ - . - .popsection -.endm - -/* - * Issue one struct alt_instr descriptor entry (need to put it into - * the section .altinstructions, see below). This entry contains - * enough information for the alternatives patching code to patch an - * instruction. See apply_alternatives(). - */ -.macro altinstruction_entry orig alt feature orig_len alt_len pad_len - .long \orig - . - .long \alt - . - .word \feature - .byte \orig_len - .byte \alt_len - .byte \pad_len -.endm - -/* - * Define an alternative between two instructions. If @feature is - * present, early code in apply_alternatives() replaces @oldinstr with - * @newinstr. ".skip" directive takes care of proper instruction padding - * in case @newinstr is longer than @oldinstr. - */ -.macro ALTERNATIVE oldinstr, newinstr, feature -140: - \oldinstr -141: - .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr -144: - .popsection -.endm - -#define old_len 141b-140b -#define new_len1 144f-143f -#define new_len2 145f-144f - -/* - * gas compatible max based on the idea from: - * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax - * - * The additional "-" is needed because gas uses a "true" value of -1. - */ -#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) - - -/* - * Same as ALTERNATIVE macro above but for two alternatives. If CPU - * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has - * @feature2, it replaces @oldinstr with @feature2. - */ -.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 -140: - \oldinstr -141: - .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ - (alt_max_short(new_len1, new_len2) - (old_len)),0x90 -142: - - .pushsection .altinstructions,"a" - altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b - altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b - .popsection - - .pushsection .altinstr_replacement,"ax" -143: - \newinstr1 -144: - \newinstr2 -145: - .popsection -.endm - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_ALTERNATIVE_ASM_H */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13adca37c99a..a3c2315aca12 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -2,13 +2,17 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H -#ifndef __ASSEMBLY__ - #include <linux/types.h> -#include <linux/stddef.h> #include <linux/stringify.h> #include <asm/asm.h> +#define ALTINSTR_FLAG_INV (1 << 15) +#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) + +#ifndef __ASSEMBLY__ + +#include <linux/stddef.h> + /* * Alternative inline assembly for SMP. * @@ -61,7 +65,6 @@ struct alt_instr { u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction */ - u8 padlen; /* length of build-time padding */ } __packed; /* @@ -100,7 +103,6 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alt_end_marker "663" #define alt_slen "662b-661b" -#define alt_pad_len alt_end_marker"b-662b" #define alt_total_slen alt_end_marker"b-661b" #define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" @@ -147,10 +149,9 @@ static inline int alternatives_text_reserved(void *start, void *end) " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte " alt_total_slen "\n" /* source len */ \ - " .byte " alt_rlen(num) "\n" /* replacement len */ \ - " .byte " alt_pad_len "\n" /* pad len */ + " .byte " alt_rlen(num) "\n" /* replacement len */ -#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ +#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \ "# ALT: replacement " #num "\n" \ b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n" @@ -161,7 +162,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ALTINSTR_REPLACEMENT(newinstr, 1) \ ".popsection\n" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ @@ -171,10 +172,15 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ - ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ALTINSTR_REPLACEMENT(newinstr1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, 2) \ ".popsection\n" +/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */ +#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ + ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ + newinstr_yes, feature) + #define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \ OLDINSTR_3(oldinsn, 1, 2, 3) \ ".pushsection .altinstructions,\"a\"\n" \ @@ -183,9 +189,9 @@ static inline int alternatives_text_reserved(void *start, void *end) ALTINSTR_ENTRY(feat3, 3) \ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ - ALTINSTR_REPLACEMENT(newinsn1, feat1, 1) \ - ALTINSTR_REPLACEMENT(newinsn2, feat2, 2) \ - ALTINSTR_REPLACEMENT(newinsn3, feat3, 3) \ + ALTINSTR_REPLACEMENT(newinsn1, 1) \ + ALTINSTR_REPLACEMENT(newinsn2, 2) \ + ALTINSTR_REPLACEMENT(newinsn3, 3) \ ".popsection\n" /* @@ -206,15 +212,15 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") +#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \ + asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory") + /* * Alternative inline assembly with input. * * Peculiarities: * No memory clobber here. * Argument numbers start with 1. - * Best is to use constraints that are fixed size (like (%1) ... "r") - * If you use variable sized constraints like "m" or "g" in the - * replacement make sure to pad to the worst case length. * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ @@ -271,6 +277,115 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_SMP + .macro LOCK_PREFIX +672: lock + .pushsection .smp_locks,"a" + .balign 4 + .long 672b - . + .popsection + .endm +#else + .macro LOCK_PREFIX + .endm +#endif + +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +.macro ANNOTATE_IGNORE_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.ignore_alts + .long .Lannotate_\@ - . + .popsection +.endm + +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ +.macro altinstruction_entry orig alt feature orig_len alt_len + .long \orig - . + .long \alt - . + .word \feature + .byte \orig_len + .byte \alt_len +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +#define old_len 141b-140b +#define new_len1 144f-143f +#define new_len2 145f-144f + +/* + * gas compatible max based on the idea from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. + */ +#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + + +/* + * Same as ALTERNATIVE macro above but for two alternatives. If CPU + * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has + * @feature2, it replaces @oldinstr with @feature2. + */ +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_short(new_len1, new_len2) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection +.endm + +/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */ +#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \ + ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \ + newinstr_yes, feature + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 51e2bf27cc9b..4cb726c71ed8 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -19,18 +19,19 @@ extern void cmpxchg8b_emu(void); #ifdef CONFIG_RETPOLINE -#define DECL_INDIRECT_THUNK(reg) \ +#undef GEN +#define GEN(reg) \ extern asmlinkage void __x86_indirect_thunk_ ## reg (void); - -#define DECL_RETPOLINE(reg) \ - extern asmlinkage void __x86_retpoline_ ## reg (void); +#include <asm/GEN-for-each-reg.h> #undef GEN -#define GEN(reg) DECL_INDIRECT_THUNK(reg) +#define GEN(reg) \ + extern asmlinkage void __x86_indirect_alt_call_ ## reg (void); #include <asm/GEN-for-each-reg.h> #undef GEN -#define GEN(reg) DECL_RETPOLINE(reg) +#define GEN(reg) \ + extern asmlinkage void __x86_indirect_alt_jmp_ ## reg (void); #include <asm/GEN-for-each-reg.h> #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 4d4ec5cbdc51..94fbe6ae7431 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -22,7 +22,7 @@ extern void __add_wrong_size(void) /* * Constants for operation sizes. On 32-bit, the 64-bit size it set to * -1 because sizeof will never return -1, thereby making those switch - * case statements guaranteeed dead code which the compiler will + * case statements guaranteed dead code which the compiler will * eliminate, and allowing the "missing symbol in the default case" to * indicate a usage error. */ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index da78ccbd493b..33d41e350c79 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -41,12 +41,14 @@ unsigned int x86_family(unsigned int sig); unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); #ifdef CONFIG_CPU_SUP_INTEL -extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c); +extern void __init sld_setup(struct cpuinfo_x86 *c); extern void switch_to_sld(unsigned long tifn); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); +extern void handle_bus_lock(struct pt_regs *regs); +u8 get_this_hybrid_cpu_type(void); #else -static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {} +static inline void __init sld_setup(struct cpuinfo_x86 *c) {} static inline void switch_to_sld(unsigned long tifn) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) { @@ -57,6 +59,13 @@ static inline bool handle_guest_split_lock(unsigned long ip) { return false; } + +static inline void handle_bus_lock(struct pt_regs *regs) {} + +static inline u8 get_this_hybrid_cpu_type(void) +{ + return 0; +} #endif #ifdef CONFIG_IA32_FEAT_CTL void init_ia32_feat_ctl(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1728d4ce5730..16a51e7288d5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -8,6 +8,7 @@ #include <asm/asm.h> #include <linux/bitops.h> +#include <asm/alternative.h> enum cpuid_leafs { @@ -175,39 +176,15 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); */ static __always_inline bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp 6f\n" - "2:\n" - ".skip -(((5f-4f) - (2b-1b)) > 0) * " - "((5f-4f) - (2b-1b)),0x90\n" - "3:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 4f - .\n" /* repl offset */ - " .word %P[always]\n" /* always replace */ - " .byte 3b - 1b\n" /* src len */ - " .byte 5f - 4f\n" /* repl len */ - " .byte 3b - 2b\n" /* pad len */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "4: jmp %l[t_no]\n" - "5:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 0\n" /* no replacement */ - " .word %P[feature]\n" /* feature bit */ - " .byte 3b - 1b\n" /* src len */ - " .byte 0\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .altinstr_aux,\"ax\"\n" - "6:\n" - " testb %[bitnum],%[cap_byte]\n" - " jnz %l[t_yes]\n" - " jmp %l[t_no]\n" - ".previous\n" + asm_volatile_goto( + ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]") + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" : : [feature] "i" (bit), - [always] "i" (X86_FEATURE_ALWAYS), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index cc96e26d69f7..3c94316169a3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -84,7 +84,7 @@ /* CPU types for specific tunings: */ #define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */ #define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ @@ -236,6 +236,8 @@ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ #define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ +#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ +#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -290,6 +292,8 @@ #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ #define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */ #define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */ +#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */ +#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -354,6 +358,7 @@ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ +#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */ #define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */ #define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */ #define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */ @@ -374,6 +379,7 @@ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ +#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ #define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9224d40cdefe..7d7500806af8 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -283,12 +283,12 @@ extern u32 elf_hwcap2; * * The decision process for determining the results are: * - * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 | - * ELF: | | | | + * CPU: | lacks NX* | has NX, ia32 | has NX, x86_64 | + * ELF: | | | | * ---------------------|------------|------------------|----------------| - * missing PT_GNU_STACK | exec-all | exec-all | exec-none | - * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack | - * PT_GNU_STACK == RW | exec-none | exec-none | exec-none | + * missing PT_GNU_STACK | exec-all | exec-all | exec-none | + * PT_GNU_STACK == RWX | exec-stack | exec-stack | exec-stack | + * PT_GNU_STACK == RW | exec-none | exec-none | exec-none | * * exec-all : all PROT_READ user mappings are executable, except when * backed by files on a noexec-filesystem. diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 2b87b191b3b8..14ebd2196569 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_ENTRY_COMMON_H #define _ASM_X86_ENTRY_COMMON_H +#include <linux/randomize_kstack.h> #include <linux/user-return-notifier.h> #include <asm/nospec-branch.h> @@ -70,6 +71,21 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, */ current_thread_info()->status &= ~(TS_COMPAT | TS_I386_REGS_POKED); #endif + + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * but not enough for x86 stack utilization comfort. To keep + * reasonable stack head room, reduce the maximum offset to 8 bits. + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints (see cc_stack_align4/8 in + * arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32) + * low bits from any entropy chosen here. + * + * Therefore, final stack offset entropy will be 5 (x86_64) or + * 6 (ia32) bits. + */ + choose_random_kstack_offset(rdtsc() & 0xFF); } #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index d43717b423cb..6ec3fc969ad5 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -74,7 +74,6 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id) int lcount; char *lptr; - st = 1; for (lcount = virtual_dma_count, lptr = virtual_dma_addr; lcount; lcount--, lptr++) { st = inb(virtual_dma_port + FD_STATUS); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index e6cd3fee562b..606f5cc579b2 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -156,7 +156,7 @@ enum hv_isolation_type { #define HV_X64_MSR_HYPERCALL 0x40000001 /* MSR used to provide vcpu index */ -#define HV_X64_MSR_VP_INDEX 0x40000002 +#define HV_REGISTER_VP_INDEX 0x40000002 /* MSR used to reset the guest OS. */ #define HV_X64_MSR_RESET 0x40000003 @@ -165,10 +165,10 @@ enum hv_isolation_type { #define HV_X64_MSR_VP_RUNTIME 0x40000010 /* MSR used to read the per-partition time reference counter */ -#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_REGISTER_TIME_REF_COUNT 0x40000020 /* A partition's reference time stamp counter (TSC) page */ -#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_REGISTER_REFERENCE_TSC 0x40000021 /* MSR used to retrieve the TSC frequency */ #define HV_X64_MSR_TSC_FREQUENCY 0x40000022 @@ -183,50 +183,50 @@ enum hv_isolation_type { #define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 /* Define synthetic interrupt controller model specific registers. */ -#define HV_X64_MSR_SCONTROL 0x40000080 -#define HV_X64_MSR_SVERSION 0x40000081 -#define HV_X64_MSR_SIEFP 0x40000082 -#define HV_X64_MSR_SIMP 0x40000083 -#define HV_X64_MSR_EOM 0x40000084 -#define HV_X64_MSR_SINT0 0x40000090 -#define HV_X64_MSR_SINT1 0x40000091 -#define HV_X64_MSR_SINT2 0x40000092 -#define HV_X64_MSR_SINT3 0x40000093 -#define HV_X64_MSR_SINT4 0x40000094 -#define HV_X64_MSR_SINT5 0x40000095 -#define HV_X64_MSR_SINT6 0x40000096 -#define HV_X64_MSR_SINT7 0x40000097 -#define HV_X64_MSR_SINT8 0x40000098 -#define HV_X64_MSR_SINT9 0x40000099 -#define HV_X64_MSR_SINT10 0x4000009A -#define HV_X64_MSR_SINT11 0x4000009B -#define HV_X64_MSR_SINT12 0x4000009C -#define HV_X64_MSR_SINT13 0x4000009D -#define HV_X64_MSR_SINT14 0x4000009E -#define HV_X64_MSR_SINT15 0x4000009F +#define HV_REGISTER_SCONTROL 0x40000080 +#define HV_REGISTER_SVERSION 0x40000081 +#define HV_REGISTER_SIEFP 0x40000082 +#define HV_REGISTER_SIMP 0x40000083 +#define HV_REGISTER_EOM 0x40000084 +#define HV_REGISTER_SINT0 0x40000090 +#define HV_REGISTER_SINT1 0x40000091 +#define HV_REGISTER_SINT2 0x40000092 +#define HV_REGISTER_SINT3 0x40000093 +#define HV_REGISTER_SINT4 0x40000094 +#define HV_REGISTER_SINT5 0x40000095 +#define HV_REGISTER_SINT6 0x40000096 +#define HV_REGISTER_SINT7 0x40000097 +#define HV_REGISTER_SINT8 0x40000098 +#define HV_REGISTER_SINT9 0x40000099 +#define HV_REGISTER_SINT10 0x4000009A +#define HV_REGISTER_SINT11 0x4000009B +#define HV_REGISTER_SINT12 0x4000009C +#define HV_REGISTER_SINT13 0x4000009D +#define HV_REGISTER_SINT14 0x4000009E +#define HV_REGISTER_SINT15 0x4000009F /* * Synthetic Timer MSRs. Four timers per vcpu. */ -#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 -#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 -#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 -#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 -#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 -#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 -#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 -#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 +#define HV_REGISTER_STIMER0_CONFIG 0x400000B0 +#define HV_REGISTER_STIMER0_COUNT 0x400000B1 +#define HV_REGISTER_STIMER1_CONFIG 0x400000B2 +#define HV_REGISTER_STIMER1_COUNT 0x400000B3 +#define HV_REGISTER_STIMER2_CONFIG 0x400000B4 +#define HV_REGISTER_STIMER2_COUNT 0x400000B5 +#define HV_REGISTER_STIMER3_CONFIG 0x400000B6 +#define HV_REGISTER_STIMER3_COUNT 0x400000B7 /* Hyper-V guest idle MSR */ #define HV_X64_MSR_GUEST_IDLE 0x400000F0 /* Hyper-V guest crash notification MSR's */ -#define HV_X64_MSR_CRASH_P0 0x40000100 -#define HV_X64_MSR_CRASH_P1 0x40000101 -#define HV_X64_MSR_CRASH_P2 0x40000102 -#define HV_X64_MSR_CRASH_P3 0x40000103 -#define HV_X64_MSR_CRASH_P4 0x40000104 -#define HV_X64_MSR_CRASH_CTL 0x40000105 +#define HV_REGISTER_CRASH_P0 0x40000100 +#define HV_REGISTER_CRASH_P1 0x40000101 +#define HV_REGISTER_CRASH_P2 0x40000102 +#define HV_REGISTER_CRASH_P3 0x40000103 +#define HV_REGISTER_CRASH_P4 0x40000104 +#define HV_REGISTER_CRASH_CTL 0x40000105 /* TSC emulation after migration */ #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 @@ -236,6 +236,32 @@ enum hv_isolation_type { /* TSC invariant control */ #define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 +/* Register name aliases for temporary compatibility */ +#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT +#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG +#define HV_X64_MSR_STIMER1_COUNT HV_REGISTER_STIMER1_COUNT +#define HV_X64_MSR_STIMER1_CONFIG HV_REGISTER_STIMER1_CONFIG +#define HV_X64_MSR_STIMER2_COUNT HV_REGISTER_STIMER2_COUNT +#define HV_X64_MSR_STIMER2_CONFIG HV_REGISTER_STIMER2_CONFIG +#define HV_X64_MSR_STIMER3_COUNT HV_REGISTER_STIMER3_COUNT +#define HV_X64_MSR_STIMER3_CONFIG HV_REGISTER_STIMER3_CONFIG +#define HV_X64_MSR_SCONTROL HV_REGISTER_SCONTROL +#define HV_X64_MSR_SVERSION HV_REGISTER_SVERSION +#define HV_X64_MSR_SIMP HV_REGISTER_SIMP +#define HV_X64_MSR_SIEFP HV_REGISTER_SIEFP +#define HV_X64_MSR_VP_INDEX HV_REGISTER_VP_INDEX +#define HV_X64_MSR_EOM HV_REGISTER_EOM +#define HV_X64_MSR_SINT0 HV_REGISTER_SINT0 +#define HV_X64_MSR_SINT15 HV_REGISTER_SINT15 +#define HV_X64_MSR_CRASH_P0 HV_REGISTER_CRASH_P0 +#define HV_X64_MSR_CRASH_P1 HV_REGISTER_CRASH_P1 +#define HV_X64_MSR_CRASH_P2 HV_REGISTER_CRASH_P2 +#define HV_X64_MSR_CRASH_P3 HV_REGISTER_CRASH_P3 +#define HV_X64_MSR_CRASH_P4 HV_REGISTER_CRASH_P4 +#define HV_X64_MSR_CRASH_CTL HV_REGISTER_CRASH_CTL +#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT +#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC + /* * Declare the MSR used to setup pages used to communicate with the hypervisor. */ @@ -288,35 +314,6 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 - -/* Define hypervisor message types. */ -enum hv_message_type { - HVMSG_NONE = 0x00000000, - - /* Memory access messages. */ - HVMSG_UNMAPPED_GPA = 0x80000000, - HVMSG_GPA_INTERCEPT = 0x80000001, - - /* Timer notification messages. */ - HVMSG_TIMER_EXPIRED = 0x80000010, - - /* Error messages. */ - HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, - HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, - HVMSG_UNSUPPORTED_FEATURE = 0x80000022, - - /* Trace buffer complete messages. */ - HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, - - /* Platform-specific processor intercept messages. */ - HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, - HVMSG_X64_MSR_INTERCEPT = 0x80010001, - HVMSG_X64_CPUID_INTERCEPT = 0x80010002, - HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, - HVMSG_X64_APIC_EOI = 0x80010004, - HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 -}; - struct hv_nested_enlightenments_control { struct { __u32 directhypercall:1; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 5eb3bdf36a41..e35e342673c7 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -547,7 +547,7 @@ SYM_CODE_END(spurious_entries_start) /* * Dummy trap number so the low level ASM macro vector number checks do not * match which results in emitting plain IDTENTRY stubs without bells and - * whistels. + * whistles. */ #define X86_TRAP_OTHER 0xFFFF diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h index 4cf2ad521f65..b56c5741581a 100644 --- a/arch/x86/include/asm/inat.h +++ b/arch/x86/include/asm/inat.h @@ -6,7 +6,7 @@ * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ -#include <asm/inat_types.h> +#include <asm/inat_types.h> /* __ignore_sync_check__ */ /* * Internal bits. Don't use bitmasks directly, because these bits are diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h index 98b4dae5e8bc..91d7182ad2d6 100644 --- a/arch/x86/include/asm/insn-eval.h +++ b/arch/x86/include/asm/insn-eval.h @@ -25,7 +25,7 @@ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]); -bool insn_decode(struct insn *insn, struct pt_regs *regs, - unsigned char buf[MAX_INSN_SIZE], int buf_size); +bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE], int buf_size); #endif /* _ASM_X86_INSN_EVAL_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 95a448fbb44c..05a6ab940f45 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -9,7 +9,7 @@ #include <asm/byteorder.h> /* insn_attr_t is defined in inat.h */ -#include <asm/inat.h> +#include <asm/inat.h> /* __ignore_sync_check__ */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) @@ -132,13 +132,25 @@ struct insn { #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); -extern void insn_get_prefixes(struct insn *insn); -extern void insn_get_opcode(struct insn *insn); -extern void insn_get_modrm(struct insn *insn); -extern void insn_get_sib(struct insn *insn); -extern void insn_get_displacement(struct insn *insn); -extern void insn_get_immediate(struct insn *insn); -extern void insn_get_length(struct insn *insn); +extern int insn_get_prefixes(struct insn *insn); +extern int insn_get_opcode(struct insn *insn); +extern int insn_get_modrm(struct insn *insn); +extern int insn_get_sib(struct insn *insn); +extern int insn_get_displacement(struct insn *insn); +extern int insn_get_immediate(struct insn *insn); +extern int insn_get_length(struct insn *insn); + +enum insn_mode { + INSN_MODE_32, + INSN_MODE_64, + /* Mode is determined by the current kernel build. */ + INSN_MODE_KERN, + INSN_NUM_MODES, +}; + +extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m); + +#define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN) /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) @@ -149,17 +161,6 @@ static inline void insn_get_attribute(struct insn *insn) /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); -/* Init insn for kernel text */ -static inline void kernel_insn_init(struct insn *insn, - const void *kaddr, int buf_len) -{ -#ifdef CONFIG_X86_64 - insn_init(insn, kaddr, buf_len, 1); -#else /* CONFIG_X86_32 */ - insn_init(insn, kaddr, buf_len, 0); -#endif -} - static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) @@ -179,13 +180,6 @@ static inline int insn_has_emulate_prefix(struct insn *insn) return !!insn->emulate_prefix_size; } -/* Ensure this instruction is decoded completely */ -static inline int insn_complete(struct insn *insn) -{ - return insn->opcode.got && insn->modrm.got && insn->sib.got && - insn->displacement.got && insn->immediate.got; -} - static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 9abe842dbd84..955b06d6325a 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -32,7 +32,9 @@ * _EP - 2 socket server parts * _EX - 4+ socket server parts * - * The #define line may optionally include a comment including platform names. + * The #define line may optionally include a comment including platform or core + * names. An exception is made for skylake/kabylake where steppings seem to have gotten + * their own names :-( */ /* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ @@ -69,35 +71,41 @@ #define INTEL_FAM6_BROADWELL_X 0x4F #define INTEL_FAM6_BROADWELL_D 0x56 -#define INTEL_FAM6_SKYLAKE_L 0x4E -#define INTEL_FAM6_SKYLAKE 0x5E -#define INTEL_FAM6_SKYLAKE_X 0x55 -#define INTEL_FAM6_KABYLAKE_L 0x8E -#define INTEL_FAM6_KABYLAKE 0x9E +#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */ +#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */ +#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */ +/* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */ +/* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */ -#define INTEL_FAM6_CANNONLAKE_L 0x66 +#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */ +/* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */ +/* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */ +/* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */ -#define INTEL_FAM6_ICELAKE_X 0x6A -#define INTEL_FAM6_ICELAKE_D 0x6C -#define INTEL_FAM6_ICELAKE 0x7D -#define INTEL_FAM6_ICELAKE_L 0x7E -#define INTEL_FAM6_ICELAKE_NNPI 0x9D +#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */ +/* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */ -#define INTEL_FAM6_TIGERLAKE_L 0x8C -#define INTEL_FAM6_TIGERLAKE 0x8D +#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */ +#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */ -#define INTEL_FAM6_COMETLAKE 0xA5 -#define INTEL_FAM6_COMETLAKE_L 0xA6 +#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */ -#define INTEL_FAM6_ROCKETLAKE 0xA7 +#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */ +#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */ +#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */ +#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ +#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ -#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F +#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ -/* Hybrid Core/Atom Processors */ +#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ -#define INTEL_FAM6_LAKEFIELD 0x8A -#define INTEL_FAM6_ALDERLAKE 0x97 -#define INTEL_FAM6_ALDERLAKE_L 0x9A +#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ +#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ +#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Willow Cove */ + +#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ +#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ /* "Small Core" Processors (Atom) */ diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h index 3cb002b1d0f9..994638ef171b 100644 --- a/arch/x86/include/asm/intel_pconfig.h +++ b/arch/x86/include/asm/intel_pconfig.h @@ -38,7 +38,7 @@ enum pconfig_leaf { #define MKTME_INVALID_ENC_ALG 4 #define MKTME_DEVICE_BUSY 5 -/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */ +/* Hardware requires the structure to be 256 byte aligned. Otherwise #GP(0). */ struct mktme_key_program { u16 keyid; u32 keyid_ctrl; diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 423b788f495e..ebe8d2ea44fe 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -3,7 +3,7 @@ #define _ASM_X86_INTEL_PT_H #define PT_CPUID_LEAVES 2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ +#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */ enum pt_capabilities { PT_CAP_max_subleaf = 0, diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d726459d08e5..841a5d104afa 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -159,7 +159,7 @@ static inline void *phys_to_virt(phys_addr_t address) /* * ISA I/O bus memory addresses are 1:1 with the physical address. * However, we truncate the address to unsigned int to avoid undesirable - * promitions in legacy drivers. + * promotions in legacy drivers. */ static inline unsigned int isa_virt_to_bus(volatile void *address) { diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 9b2a0ff76c73..562854c60808 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -190,7 +190,7 @@ /* * Macro to invoke __do_softirq on the irq stack. This is only called from - * task context when bottom halfs are about to be reenabled and soft + * task context when bottom halves are about to be reenabled and soft * interrupts are pending to be processed. The interrupt stack cannot be in * use here. */ diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 144d70ea4393..c5ce9845c999 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -109,18 +109,13 @@ static __always_inline unsigned long arch_local_irq_save(void) } #else -#define ENABLE_INTERRUPTS(x) sti -#define DISABLE_INTERRUPTS(x) cli - #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_ENTRY -#define SAVE_FLAGS(x) pushfq; popq %rax +#define SAVE_FLAGS pushfq; popq %rax #endif #define INTERRUPT_RETURN jmp native_iret -#else -#define INTERRUPT_RETURN iret #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 06c3cc22a058..610a05374c02 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -6,12 +6,6 @@ #define JUMP_LABEL_NOP_SIZE 5 -#ifdef CONFIG_X86_64 -# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC -#else -# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC -#endif - #include <asm/asm.h> #include <asm/nops.h> @@ -20,10 +14,10 @@ #include <linux/stringify.h> #include <linux/types.h> -static __always_inline bool arch_static_branch(struct static_key *key, bool branch) +static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" - ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" + ".byte " __stringify(BYTES_NOP5) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" ".long 1b - ., %l[l_yes] - . \n\t" @@ -36,7 +30,7 @@ l_yes: return true; } -static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch) +static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" @@ -63,7 +57,7 @@ l_yes: .long \target - .Lstatic_jump_after_\@ .Lstatic_jump_after_\@: .else - .byte STATIC_KEY_INIT_NOP + .byte BYTES_NOP5 .endif .pushsection __jump_table, "aw" _ASM_ALIGN @@ -75,7 +69,7 @@ l_yes: .macro STATIC_JUMP_IF_FALSE target, key, def .Lstatic_jump_\@: .if \def - .byte STATIC_KEY_INIT_NOP + .byte BYTES_NOP5 .else /* Equivalent to "jmp.d32 \target" */ .byte 0xe9 diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 6802c59e8252..0a6e34b07017 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -150,11 +150,6 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; - - /* Core ELF header buffer */ - void *elf_headers; - unsigned long elf_headers_sz; - unsigned long elf_load_addr; }; #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h index 97bbb4a9083a..05b48b33baf0 100644 --- a/arch/x86/include/asm/kfence.h +++ b/arch/x86/include/asm/kfence.h @@ -56,8 +56,13 @@ static inline bool kfence_protect_page(unsigned long addr, bool protect) else set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - /* Flush this CPU's TLB. */ + /* + * Flush this CPU's TLB, assuming whoever did the allocation/free is + * likely to continue running on this CPU. + */ + preempt_disable(); flush_tlb_one_kernel(addr); + preempt_enable(); return true; } diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index d20a3d6be36e..bd7f5886a789 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -65,10 +65,22 @@ struct arch_specific_insn { * a post_handler). */ unsigned boostable:1; - unsigned if_modifier:1; - unsigned is_call:1; - unsigned is_pushf:1; - unsigned is_abs_ip:1; + unsigned char size; /* The size of insn */ + union { + unsigned char opcode; + struct { + unsigned char type; + } jcc; + struct { + unsigned char type; + unsigned char asize; + } loop; + struct { + unsigned char reg; + } indirect; + }; + s32 rel32; /* relative offset must be s32, s16, or s8 */ + void (*emulate_op)(struct kprobe *p, struct pt_regs *regs); /* Number of bytes of text poked */ int tp_len; }; @@ -107,7 +119,6 @@ extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); extern int kprobe_int3_handler(struct pt_regs *regs); -extern int kprobe_debug_handler(struct pt_regs *regs); #else diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3768819693e5..10eca9e8f7f6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1488,7 +1488,7 @@ extern u64 kvm_mce_cap_supported; /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context - * should be resued as is, i.e. skip initialization of + * should be reused as is, i.e. skip initialization of * emulation context, instruction fetch and decode. * * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. @@ -1513,7 +1513,7 @@ extern u64 kvm_mce_cap_supported; * * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware * backdoor emulation, which is opt in via module param. - * VMware backoor emulation handles select instructions + * VMware backdoor emulation handles select instructions * and reinjects the #GP for all other cases. * * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index ccf60a809a17..67ff0d637e55 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -9,70 +9,29 @@ #include <asm/hyperv-tlfs.h> #include <asm/nospec-branch.h> #include <asm/paravirt.h> +#include <asm/mshyperv.h> typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, void *data); -#define hv_init_timer(timer, tick) \ - wrmsrl(HV_X64_MSR_STIMER0_COUNT + (2*timer), tick) -#define hv_init_timer_config(timer, val) \ - wrmsrl(HV_X64_MSR_STIMER0_CONFIG + (2*timer), val) - -#define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val) -#define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val) - -#define hv_get_siefp(val) rdmsrl(HV_X64_MSR_SIEFP, val) -#define hv_set_siefp(val) wrmsrl(HV_X64_MSR_SIEFP, val) - -#define hv_get_synic_state(val) rdmsrl(HV_X64_MSR_SCONTROL, val) -#define hv_set_synic_state(val) wrmsrl(HV_X64_MSR_SCONTROL, val) - -#define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index) - -#define hv_signal_eom() wrmsrl(HV_X64_MSR_EOM, 0) - -#define hv_get_synint_state(int_num, val) \ - rdmsrl(HV_X64_MSR_SINT0 + int_num, val) -#define hv_set_synint_state(int_num, val) \ - wrmsrl(HV_X64_MSR_SINT0 + int_num, val) -#define hv_recommend_using_aeoi() \ - (!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)) +static inline void hv_set_register(unsigned int reg, u64 value) +{ + wrmsrl(reg, value); +} -#define hv_get_crash_ctl(val) \ - rdmsrl(HV_X64_MSR_CRASH_CTL, val) +static inline u64 hv_get_register(unsigned int reg) +{ + u64 value; -#define hv_get_time_ref_count(val) \ - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, val) + rdmsrl(reg, value); + return value; +} -#define hv_get_reference_tsc(val) \ - rdmsrl(HV_X64_MSR_REFERENCE_TSC, val) -#define hv_set_reference_tsc(val) \ - wrmsrl(HV_X64_MSR_REFERENCE_TSC, val) -#define hv_set_clocksource_vdso(val) \ - ((val).vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK) -#define hv_enable_vdso_clocksource() \ - vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); #define hv_get_raw_timer() rdtsc_ordered() -#define hv_get_vector() HYPERVISOR_CALLBACK_VECTOR - -/* - * Reference to pv_ops must be inline so objtool - * detection of noinstr violations can work correctly. - */ -static __always_inline void hv_setup_sched_clock(void *sched_clock) -{ -#ifdef CONFIG_PARAVIRT - pv_ops.time.sched_clock = sched_clock; -#endif -} void hyperv_vector_handler(struct pt_regs *regs); -static inline void hv_enable_stimer0_percpu_irq(int irq) {} -static inline void hv_disable_stimer0_percpu_irq(int irq) {} - - #if IS_ENABLED(CONFIG_HYPERV) extern int hyperv_init_cpuhp; @@ -189,38 +148,6 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) return hv_status; } -/* - * Rep hypercalls. Callers of this functions are supposed to ensure that - * rep_count and varhead_size comply with Hyper-V hypercall definition. - */ -static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, - void *input, void *output) -{ - u64 control = code; - u64 status; - u16 rep_comp; - - control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; - control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; - - do { - status = hv_do_hypercall(control, input, output); - if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) - return status; - - /* Bits 32-43 of status have 'Reps completed' data. */ - rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >> - HV_HYPERCALL_REP_COMP_OFFSET; - - control &= ~HV_HYPERCALL_REP_START_MASK; - control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET; - - touch_nmi_watchdog(); - } while (rep_comp < rep_count); - - return status; -} - extern struct hv_vp_assist_page **hv_vp_assist_page; static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) @@ -233,9 +160,6 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) void __init hyperv_init(void); void hyperv_setup_mmu_ops(void); -void *hv_alloc_hyperv_page(void); -void *hv_alloc_hyperv_zeroed_page(void); -void hv_free_hyperv_page(unsigned long addr); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); @@ -272,8 +196,6 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline void hyperv_setup_mmu_ops(void) {} -static inline void *hv_alloc_hyperv_page(void) { return NULL; } -static inline void hv_free_hyperv_page(unsigned long addr) {} static inline void set_hv_tscchange_cb(void (*cb)(void)) {} static inline void clear_hv_tscchange_cb(void) {} static inline void hyperv_stop_tsc_emulation(void) {}; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 546d6ecf0a35..742d89a00721 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -185,6 +185,9 @@ #define MSR_PEBS_DATA_CFG 0x000003f2 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define PERF_CAP_METRICS_IDX 15 +#define PERF_CAP_PT_IDX 16 + #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_IA32_RTIT_CTL 0x00000570 @@ -265,6 +268,7 @@ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ #define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2) #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) #define DEBUGCTLMSR_BTINT (1UL << 8) @@ -628,8 +632,6 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) -#define MSR_IA32_TSCDEADLINE 0x000006e0 - #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index 12f12b5cf2ca..c1e5e818ba16 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -4,89 +4,58 @@ /* * Define nops for use with alternative() and for tracing. - * - * *_NOP5_ATOMIC must be a single instruction. */ -#define NOP_DS_PREFIX 0x3e +#ifndef CONFIG_64BIT -/* generic versions from gas - 1: nop - the following instructions are NOT nops in 64-bit mode, - for 64-bit mode use K8 or P6 nops instead - 2: movl %esi,%esi - 3: leal 0x00(%esi),%esi - 4: leal 0x00(,%esi,1),%esi - 6: leal 0x00000000(%esi),%esi - 7: leal 0x00000000(,%esi,1),%esi -*/ -#define GENERIC_NOP1 0x90 -#define GENERIC_NOP2 0x89,0xf6 -#define GENERIC_NOP3 0x8d,0x76,0x00 -#define GENERIC_NOP4 0x8d,0x74,0x26,0x00 -#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4 -#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00 -#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 -#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7 -#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4 +/* + * Generic 32bit nops from GAS: + * + * 1: nop + * 2: movl %esi,%esi + * 3: leal 0x0(%esi),%esi + * 4: leal 0x0(%esi,%eiz,1),%esi + * 5: leal %ds:0x0(%esi,%eiz,1),%esi + * 6: leal 0x0(%esi),%esi + * 7: leal 0x0(%esi,%eiz,1),%esi + * 8: leal %ds:0x0(%esi,%eiz,1),%esi + * + * Except 5 and 8, which are DS prefixed 4 and 7 resp, where GAS would emit 2 + * nop instructions. + */ +#define BYTES_NOP1 0x90 +#define BYTES_NOP2 0x89,0xf6 +#define BYTES_NOP3 0x8d,0x76,0x00 +#define BYTES_NOP4 0x8d,0x74,0x26,0x00 +#define BYTES_NOP5 0x3e,BYTES_NOP4 +#define BYTES_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00 +#define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 +#define BYTES_NOP8 0x3e,BYTES_NOP7 -/* Opteron 64bit nops - 1: nop - 2: osp nop - 3: osp osp nop - 4: osp osp osp nop -*/ -#define K8_NOP1 GENERIC_NOP1 -#define K8_NOP2 0x66,K8_NOP1 -#define K8_NOP3 0x66,K8_NOP2 -#define K8_NOP4 0x66,K8_NOP3 -#define K8_NOP5 K8_NOP3,K8_NOP2 -#define K8_NOP6 K8_NOP3,K8_NOP3 -#define K8_NOP7 K8_NOP4,K8_NOP3 -#define K8_NOP8 K8_NOP4,K8_NOP4 -#define K8_NOP5_ATOMIC 0x66,K8_NOP4 +#else -/* K7 nops - uses eax dependencies (arbitrary choice) - 1: nop - 2: movl %eax,%eax - 3: leal (,%eax,1),%eax - 4: leal 0x00(,%eax,1),%eax - 6: leal 0x00000000(%eax),%eax - 7: leal 0x00000000(,%eax,1),%eax -*/ -#define K7_NOP1 GENERIC_NOP1 -#define K7_NOP2 0x8b,0xc0 -#define K7_NOP3 0x8d,0x04,0x20 -#define K7_NOP4 0x8d,0x44,0x20,0x00 -#define K7_NOP5 K7_NOP4,K7_NOP1 -#define K7_NOP6 0x8d,0x80,0,0,0,0 -#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0 -#define K7_NOP8 K7_NOP7,K7_NOP1 -#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4 +/* + * Generic 64bit nops from GAS: + * + * 1: nop + * 2: osp nop + * 3: nopl (%eax) + * 4: nopl 0x00(%eax) + * 5: nopl 0x00(%eax,%eax,1) + * 6: osp nopl 0x00(%eax,%eax,1) + * 7: nopl 0x00000000(%eax) + * 8: nopl 0x00000000(%eax,%eax,1) + */ +#define BYTES_NOP1 0x90 +#define BYTES_NOP2 0x66,BYTES_NOP1 +#define BYTES_NOP3 0x0f,0x1f,0x00 +#define BYTES_NOP4 0x0f,0x1f,0x40,0x00 +#define BYTES_NOP5 0x0f,0x1f,0x44,0x00,0x00 +#define BYTES_NOP6 0x66,BYTES_NOP5 +#define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00 +#define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 -/* P6 nops - uses eax dependencies (Intel-recommended choice) - 1: nop - 2: osp nop - 3: nopl (%eax) - 4: nopl 0x00(%eax) - 5: nopl 0x00(%eax,%eax,1) - 6: osp nopl 0x00(%eax,%eax,1) - 7: nopl 0x00000000(%eax) - 8: nopl 0x00000000(%eax,%eax,1) - Note: All the above are assumed to be a single instruction. - There is kernel code that depends on this. -*/ -#define P6_NOP1 GENERIC_NOP1 -#define P6_NOP2 0x66,0x90 -#define P6_NOP3 0x0f,0x1f,0x00 -#define P6_NOP4 0x0f,0x1f,0x40,0 -#define P6_NOP5 0x0f,0x1f,0x44,0x00,0 -#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0 -#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0 -#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0 -#define P6_NOP5_ATOMIC P6_NOP5 +#endif /* CONFIG_64BIT */ #ifdef __ASSEMBLY__ #define _ASM_MK_NOP(x) .byte x @@ -94,54 +63,19 @@ #define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" #endif -#if defined(CONFIG_MK7) -#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1) -#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2) -#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3) -#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4) -#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5) -#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6) -#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7) -#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8) -#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC) -#elif defined(CONFIG_X86_P6_NOP) -#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1) -#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2) -#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3) -#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4) -#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5) -#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6) -#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7) -#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8) -#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC) -#elif defined(CONFIG_X86_64) -#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1) -#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2) -#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3) -#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4) -#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5) -#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6) -#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7) -#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8) -#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC) -#else -#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1) -#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2) -#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3) -#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4) -#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5) -#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6) -#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7) -#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8) -#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC) -#endif +#define ASM_NOP1 _ASM_MK_NOP(BYTES_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(BYTES_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(BYTES_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(BYTES_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(BYTES_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(BYTES_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(BYTES_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(BYTES_NOP8) #define ASM_NOP_MAX 8 -#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */ #ifndef __ASSEMBLY__ -extern const unsigned char * const *ideal_nops; -extern void arch_init_ideal_nops(void); +extern const unsigned char * const x86_nops[]; #endif #endif /* _ASM_X86_NOPS_H */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index cb9ad6b73973..3ad8c6d3cbb3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,7 +7,6 @@ #include <linux/objtool.h> #include <asm/alternative.h> -#include <asm/alternative-asm.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> #include <asm/unwind_hints.h> @@ -33,7 +32,7 @@ /* * Google experimented with loop-unrolling and this turned out to be - * the optimal version — two calls, each with their own speculation + * the optimal version - two calls, each with their own speculation * trap should their return address end up getting used, in a loop. */ #define __FILL_RETURN_BUFFER(reg, nr, sp) \ @@ -81,7 +80,7 @@ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ - __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD #else jmp *%\reg @@ -91,7 +90,7 @@ .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ - __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD #else call *%\reg @@ -129,7 +128,7 @@ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - "call __x86_retpoline_%V[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4abf110e2243..43992e5c52c2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -15,11 +15,20 @@ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> +#include <linux/static_call_types.h> #include <asm/frame.h> -static inline unsigned long long paravirt_sched_clock(void) +u64 dummy_steal_clock(int cpu); +u64 dummy_sched_clock(void); + +DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); +DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); + +void paravirt_set_sched_clock(u64 (*func)(void)); + +static inline u64 paravirt_sched_clock(void) { - return PVOP_CALL0(unsigned long long, time.sched_clock); + return static_call(pv_sched_clock)(); } struct static_key; @@ -33,9 +42,13 @@ bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { - return PVOP_CALL1(u64, time.steal_clock, cpu); + return static_call(pv_steal_clock)(cpu); } +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init paravirt_set_cap(void); +#endif + /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { @@ -122,7 +135,9 @@ static inline void write_cr0(unsigned long x) static inline unsigned long read_cr2(void) { - return PVOP_CALLEE0(unsigned long, mmu.read_cr2); + return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, + "mov %%cr2, %%rax;", + ALT_NOT(X86_FEATURE_XENPV)); } static inline void write_cr2(unsigned long x) @@ -132,12 +147,14 @@ static inline void write_cr2(unsigned long x) static inline unsigned long __read_cr3(void) { - return PVOP_CALL0(unsigned long, mmu.read_cr3); + return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, + "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV)); } static inline void write_cr3(unsigned long x) { - PVOP_VCALL1(mmu.write_cr3, x); + PVOP_ALT_VCALL1(mmu.write_cr3, x, + "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV)); } static inline void __write_cr4(unsigned long x) @@ -157,7 +174,7 @@ static inline void halt(void) static inline void wbinvd(void) { - PVOP_VCALL0(cpu.wbinvd); + PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV)); } static inline u64 paravirt_read_msr(unsigned msr) @@ -371,22 +388,28 @@ static inline void paravirt_release_p4d(unsigned long pfn) static inline pte_t __pte(pteval_t val) { - return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) }; + return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pteval_t pte_val(pte_t pte) { - return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte); + return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline pgd_t __pgd(pgdval_t val) { - return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) }; + return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pgdval_t pgd_val(pgd_t pgd) { - return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd); + return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -419,12 +442,15 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline pmd_t __pmd(pmdval_t val) { - return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) }; + return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)) }; } static inline pmdval_t pmd_val(pmd_t pmd) { - return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd); + return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void set_pud(pud_t *pudp, pud_t pud) @@ -436,14 +462,16 @@ static inline pud_t __pud(pudval_t val) { pudval_t ret; - ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val); + ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { - return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud); + return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void pud_clear(pud_t *pudp) @@ -462,14 +490,17 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) static inline p4d_t __p4d(p4dval_t val) { - p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val); + p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, + "mov %%rdi, %%rax", + ALT_NOT(X86_FEATURE_XENPV)); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { - return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d); + return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, + "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV)); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) @@ -556,7 +587,9 @@ static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { - PVOP_VCALLEE1(lock.queued_spin_unlock, lock); + PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock, + "movb $0, (%%" _ASM_ARG1 ");", + ALT_NOT(X86_FEATURE_PVUNLOCK)); } static __always_inline void pv_wait(u8 *ptr, u8 val) @@ -571,7 +604,9 @@ static __always_inline void pv_kick(int cpu) static __always_inline bool pv_vcpu_is_preempted(long cpu) { - return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu); + return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu, + "xor %%" _ASM_AX ", %%" _ASM_AX ";", + ALT_NOT(X86_FEATURE_VCPUPREEMPT)); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); @@ -645,17 +680,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #ifdef CONFIG_PARAVIRT_XXL static inline notrace unsigned long arch_local_save_flags(void) { - return PVOP_CALLEE0(unsigned long, irq.save_fl); + return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", + ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace void arch_local_irq_disable(void) { - PVOP_VCALLEE0(irq.irq_disable); + PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace void arch_local_irq_enable(void) { - PVOP_VCALLEE0(irq.irq_enable); + PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV)); } static inline notrace unsigned long arch_local_irq_save(void) @@ -700,84 +736,27 @@ extern void default_banner(void); .popsection -#define COND_PUSH(set, mask, reg) \ - .if ((~(set)) & mask); push %reg; .endif -#define COND_POP(set, mask, reg) \ - .if ((~(set)) & mask); pop %reg; .endif - #ifdef CONFIG_X86_64 - -#define PV_SAVE_REGS(set) \ - COND_PUSH(set, CLBR_RAX, rax); \ - COND_PUSH(set, CLBR_RCX, rcx); \ - COND_PUSH(set, CLBR_RDX, rdx); \ - COND_PUSH(set, CLBR_RSI, rsi); \ - COND_PUSH(set, CLBR_RDI, rdi); \ - COND_PUSH(set, CLBR_R8, r8); \ - COND_PUSH(set, CLBR_R9, r9); \ - COND_PUSH(set, CLBR_R10, r10); \ - COND_PUSH(set, CLBR_R11, r11) -#define PV_RESTORE_REGS(set) \ - COND_POP(set, CLBR_R11, r11); \ - COND_POP(set, CLBR_R10, r10); \ - COND_POP(set, CLBR_R9, r9); \ - COND_POP(set, CLBR_R8, r8); \ - COND_POP(set, CLBR_RDI, rdi); \ - COND_POP(set, CLBR_RSI, rsi); \ - COND_POP(set, CLBR_RDX, rdx); \ - COND_POP(set, CLBR_RCX, rcx); \ - COND_POP(set, CLBR_RAX, rax) +#ifdef CONFIG_PARAVIRT_XXL #define PARA_PATCH(off) ((off) / 8) #define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) -#else -#define PV_SAVE_REGS(set) \ - COND_PUSH(set, CLBR_EAX, eax); \ - COND_PUSH(set, CLBR_EDI, edi); \ - COND_PUSH(set, CLBR_ECX, ecx); \ - COND_PUSH(set, CLBR_EDX, edx) -#define PV_RESTORE_REGS(set) \ - COND_POP(set, CLBR_EDX, edx); \ - COND_POP(set, CLBR_ECX, ecx); \ - COND_POP(set, CLBR_EDI, edi); \ - COND_POP(set, CLBR_EAX, eax) - -#define PARA_PATCH(off) ((off) / 4) -#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4) -#define PARA_INDIRECT(addr) *%cs:addr -#endif -#ifdef CONFIG_PARAVIRT_XXL #define INTERRUPT_RETURN \ - PARA_SITE(PARA_PATCH(PV_CPU_iret), \ - ANNOTATE_RETPOLINE_SAFE; \ - jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);) - -#define DISABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) - -#define ENABLE_INTERRUPTS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) -#endif + ANNOTATE_RETPOLINE_SAFE; \ + ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \ + X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;") -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY -#define SAVE_FLAGS(clobbers) \ - PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \ - PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \ - PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +.macro PARA_IRQ_save_fl + PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), + ANNOTATE_RETPOLINE_SAFE; + call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);) +.endm + +#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \ + ALT_NOT(X86_FEATURE_XENPV) #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ @@ -800,5 +779,11 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif + +#ifndef CONFIG_PARAVIRT_SPINLOCKS +static inline void paravirt_set_cap(void) +{ +} +#endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index de87087d3bde..ae692c3194e9 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -3,7 +3,6 @@ #define _ASM_X86_PARAVIRT_TYPES_H /* Bitmask of what can be clobbered: usually at least eax. */ -#define CLBR_NONE 0 #define CLBR_EAX (1 << 0) #define CLBR_ECX (1 << 1) #define CLBR_EDX (1 << 2) @@ -15,7 +14,6 @@ #define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) #define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) #else #define CLBR_RAX CLBR_EAX #define CLBR_RCX CLBR_ECX @@ -32,12 +30,9 @@ #define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ CLBR_RCX | CLBR_R8 | CLBR_R9) #define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) #endif /* X86_64 */ -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) - #ifndef __ASSEMBLY__ #include <asm/desc_defs.h> @@ -73,19 +68,6 @@ struct pv_info { const char *name; }; -struct pv_init_ops { - /* - * Patch may replace one of the defined code sequences with - * arbitrary code, subject to the same register constraints. - * This generally means the code is not free to clobber any - * registers other than EAX. The patch function should return - * the number of bytes of code generated, as we nop pad the - * rest in generic code. - */ - unsigned (*patch)(u8 type, void *insn_buff, - unsigned long addr, unsigned len); -} __no_randomize_layout; - #ifdef CONFIG_PARAVIRT_XXL struct pv_lazy_ops { /* Set deferred update mode, used for batching operations. */ @@ -95,11 +77,6 @@ struct pv_lazy_ops { } __no_randomize_layout; #endif -struct pv_time_ops { - unsigned long long (*sched_clock)(void); - unsigned long long (*steal_clock)(int cpu); -} __no_randomize_layout; - struct pv_cpu_ops { /* hooks for various privileged instructions */ void (*io_delay)(void); @@ -156,10 +133,6 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); - /* Normal iret. Jump to this with the standard iret stack - frame set up. */ - void (*iret)(void); - void (*start_context_switch)(struct task_struct *prev); void (*end_context_switch)(struct task_struct *next); #endif @@ -290,8 +263,6 @@ struct pv_lock_ops { * number for each function using the offset which we use to indicate * what to patch. */ struct paravirt_patch_template { - struct pv_init_ops init; - struct pv_time_ops time; struct pv_cpu_ops cpu; struct pv_irq_ops irq; struct pv_mmu_ops mmu; @@ -300,6 +271,7 @@ struct paravirt_patch_template { extern struct pv_info pv_info; extern struct paravirt_patch_template pv_ops; +extern void (*paravirt_iret)(void); #define PARAVIRT_PATCH(x) \ (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) @@ -331,11 +303,7 @@ extern struct paravirt_patch_template pv_ops; /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" -unsigned paravirt_patch_ident_64(void *insn_buff, unsigned len); -unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len); -unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end); - -unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned len); +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len); int paravirt_disable_iospace(void); @@ -371,7 +339,7 @@ int paravirt_disable_iospace(void); * on the stack. All caller-save registers (eax,edx,ecx) are expected * to be modified (either clobbered or used for return values). * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameteres going on %rdi, %rsi, + * conventions, returning at %rax, with parameters going on %rdi, %rsi, * %rdx, and %rcx. Note that for this reason, x86_64 does not need any * special handling for dealing with 4 arguments, unlike i386. * However, x86_64 also have to clobber all caller saved registers, which @@ -414,11 +382,9 @@ int paravirt_disable_iospace(void); * makes sure the incoming and outgoing types are always correct. */ #ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ +#define PVOP_CALL_ARGS \ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) #define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) #define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) @@ -434,12 +400,10 @@ int paravirt_disable_iospace(void); #define VEXTRA_CLOBBERS #else /* CONFIG_X86_64 */ /* [re]ax isn't an arg, but the return val */ -#define PVOP_VCALL_ARGS \ +#define PVOP_CALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ __edx = __edx, __ecx = __ecx, __eax = __eax; -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) #define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) #define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) @@ -464,152 +428,138 @@ int paravirt_disable_iospace(void); #define PVOP_TEST_NULL(op) ((void)pv_ops.op) #endif -#define PVOP_RETMASK(rettype) \ +#define PVOP_RETVAL(rettype) \ ({ unsigned long __mask = ~0UL; \ + BUILD_BUG_ON(sizeof(rettype) > sizeof(unsigned long)); \ switch (sizeof(rettype)) { \ case 1: __mask = 0xffUL; break; \ case 2: __mask = 0xffffUL; break; \ case 4: __mask = 0xffffffffUL; break; \ default: break; \ } \ - __mask; \ + __mask & __eax; \ }) -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ +#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \ ({ \ - rettype __ret; \ PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - /* This is 32-bit specific, but is okay in 64-bit */ \ - /* since this condition will never hold */ \ - if (sizeof(rettype) > sizeof(unsigned long)) { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)((((u64)__edx) << 32) | __eax); \ - } else { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr, ASM_CALL_CONSTRAINT \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \ - } \ - __ret; \ + asm volatile(paravirt_alt(PARAVIRT_CALL) \ + : call_clbr, ASM_CALL_CONSTRAINT \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + ret; \ }) -#define __PVOP_CALL(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ +#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \ + extra_clbr, ...) \ ({ \ - PVOP_VCALL_ARGS; \ + PVOP_CALL_ARGS; \ PVOP_TEST_NULL(op); \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ + asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \ + alt, cond) \ : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ + ret; \ }) -#define __PVOP_VCALL(op, pre, post, ...) \ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) +#define __PVOP_CALL(rettype, op, ...) \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \ + PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__) + +#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\ + PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \ + ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, ...) \ + ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + +#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \ + ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \ + CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__) + + +#define __PVOP_VCALL(op, ...) \ + (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, ##__VA_ARGS__) + +#define __PVOP_ALT_VCALL(op, alt, cond, ...) \ + (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \ + PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \ + ##__VA_ARGS__) -#define __PVOP_VCALLEESAVE(op, pre, post, ...) \ - ____PVOP_VCALL(op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) +#define __PVOP_VCALLEESAVE(op, ...) \ + (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) +#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \ + (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__) #define PVOP_CALL0(rettype, op) \ - __PVOP_CALL(rettype, op, "", "") + __PVOP_CALL(rettype, op) #define PVOP_VCALL0(op) \ - __PVOP_VCALL(op, "", "") + __PVOP_VCALL(op) +#define PVOP_ALT_CALL0(rettype, op, alt, cond) \ + __PVOP_ALT_CALL(rettype, op, alt, cond) +#define PVOP_ALT_VCALL0(op, alt, cond) \ + __PVOP_ALT_VCALL(op, alt, cond) #define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") + __PVOP_CALLEESAVE(rettype, op) #define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") + __PVOP_VCALLEESAVE(op) +#define PVOP_ALT_CALLEE0(rettype, op, alt, cond) \ + __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond) +#define PVOP_ALT_VCALLEE0(op, alt, cond) \ + __PVOP_ALT_VCALLEESAVE(op, alt, cond) #define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1)) #define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_VCALL1(op, arg1, alt, cond) \ + __PVOP_ALT_VCALL(op, alt, cond, PVOP_CALL_ARG1(arg1)) #define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_CALLEESAVE(rettype, op, PVOP_CALL_ARG1(arg1)) #define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + __PVOP_VCALLEESAVE(op, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_CALLEE1(rettype, op, arg1, alt, cond) \ + __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, PVOP_CALL_ARG1(arg1)) +#define PVOP_ALT_VCALLEE1(op, arg1, alt, cond) \ + __PVOP_ALT_VCALLEESAVE(op, alt, cond, PVOP_CALL_ARG1(arg1)) #define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2)) #define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2)) #define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), \ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) #define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), \ PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) -/* This is the only difference in x86_64. We can make it much simpler */ -#ifdef CONFIG_X86_32 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ __PVOP_CALL(rettype, op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) -#else -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#endif /* Lazy mode for batching updates / context switch */ enum paravirt_lazy_mode { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a02c67291cfc..b1099f2d9800 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1244,7 +1244,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * - * dst - pointer to pgd range anwhere on a pgd page + * dst - pointer to pgd range anywhere on a pgd page * src - "" * count - the number of pgds to copy. * diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f1b9ed5efaa9..154321d29050 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -314,11 +314,6 @@ struct x86_hw_tss { struct x86_hw_tss { u32 reserved1; u64 sp0; - - /* - * We store cpu_current_top_of_stack in sp1 so it's always accessible. - * Linux does not use ring 1, so sp1 is not otherwise needed. - */ u64 sp1; /* @@ -426,12 +421,7 @@ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); -#ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); -#else -/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ -#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 -#endif #ifdef CONFIG_X86_64 struct fixed_percpu_data { @@ -439,6 +429,9 @@ struct fixed_percpu_data { * GCC hardcodes the stack canary as %gs:40. Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary. + * + * Once we are willing to require -mstack-protector-guard-symbol= + * support for x86_64 stackprotector, we can get rid of this. */ char gs_base[40]; unsigned long stack_canary; @@ -460,17 +453,7 @@ extern asmlinkage void ignore_sysret(void); void current_save_fsgs(void); #else /* X86_64 */ #ifdef CONFIG_STACKPROTECTOR -/* - * Make sure stack canary segment base is cached-aligned: - * "For Intel Atom processors, avoid non zero segment base address - * that is not aligned to cache line boundary at all cost." - * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) - */ -struct stack_canary { - char __pad[20]; /* canary at %gs:20 */ - unsigned long canary; -}; -DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +DECLARE_PER_CPU(unsigned long, __stack_chk_guard); #endif DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); @@ -527,7 +510,7 @@ struct thread_struct { struct io_bitmap *io_bitmap; /* - * IOPL. Priviledge level dependent I/O permission which is + * IOPL. Privilege level dependent I/O permission which is * emulated via the I/O bitmap to prevent user space from disabling * interrupts. */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index b6a9d51d1d79..8c5d1910a848 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -4,6 +4,8 @@ #include <asm/ldt.h> +struct task_struct; + /* misc architecture specific prototypes */ void syscall_init(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 409f661481e1..b94f615600d5 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -37,7 +37,10 @@ struct pt_regs { unsigned short __esh; unsigned short fs; unsigned short __fsh; - /* On interrupt, gs and __gsh store the vector number. */ + /* + * On interrupt, gs and __gsh store the vector number. They never + * store gs any more. + */ unsigned short gs; unsigned short __gsh; /* On interrupt, this is the error code. */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 7fdd4facfce7..72044026eb3c 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -95,7 +95,7 @@ * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] - * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8 + * 28 - unused * 29 - unused * 30 - unused * 31 - TSS for double fault handler @@ -118,7 +118,6 @@ #define GDT_ENTRY_ESPFIX_SS 26 #define GDT_ENTRY_PERCPU 27 -#define GDT_ENTRY_STACK_CANARY 28 #define GDT_ENTRY_DOUBLEFAULT_TSS 31 @@ -158,12 +157,6 @@ # define __KERNEL_PERCPU 0 #endif -#ifdef CONFIG_STACKPROTECTOR -# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) -#else -# define __KERNEL_STACK_CANARY 0 -#endif - #else /* 64-bit: */ #include <asm/cache.h> @@ -364,22 +357,15 @@ static inline void __loadsegment_fs(unsigned short value) asm("mov %%" #seg ",%0":"=r" (value) : : "memory") /* - * x86-32 user GS accessors: + * x86-32 user GS accessors. This is ugly and could do with some cleaning up. */ #ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_32_LAZY_GS -# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) -# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -# define task_user_gs(tsk) ((tsk)->thread.gs) -# define lazy_save_gs(v) savesegment(gs, (v)) -# define lazy_load_gs(v) loadsegment(gs, (v)) -# else /* X86_32_LAZY_GS */ -# define get_user_gs(regs) (u16)((regs)->gs) -# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -# define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -# define lazy_save_gs(v) do { } while (0) -# define lazy_load_gs(v) do { } while (0) -# endif /* X86_32_LAZY_GS */ +# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) +# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +# define task_user_gs(tsk) ((tsk)->thread.gs) +# define lazy_save_gs(v) savesegment(gs, (v)) +# define lazy_load_gs(v) loadsegment(gs, (v)) +# define load_gs_index(v) loadsegment(gs, (v)) #endif /* X86_32 */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 4352f08bfbb5..43fa081a1adb 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -8,8 +8,8 @@ /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: - * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack - * Executability : eXeutable, NoteXecutable + * Cacheability : UnCached, WriteCombining, WriteThrough, WriteBack + * Executability : eXecutable, NoteXecutable * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent * Encryption : Encrypted, Decrypted diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 389d851a02c4..a12458a7a8d4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -130,11 +130,6 @@ void *extend_brk(size_t size, size_t align); : : "i" (sz)); \ } -/* Helper for reserving space for arrays of things */ -#define RESERVE_BRK_ARRAY(type, name, entries) \ - type *name; \ - RESERVE_BRK(name, sizeof(type) * entries) - extern void probe_roms(void); #ifdef __i386__ diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h index dd7602c44c72..9c31e0ebc55b 100644 --- a/arch/x86/kernel/cpu/sgx/arch.h +++ b/arch/x86/include/asm/sgx.h @@ -2,15 +2,20 @@ /** * Copyright(c) 2016-20 Intel Corporation. * - * Contains data structures defined by the SGX architecture. Data structures - * defined by the Linux software stack should not be placed here. + * Intel Software Guard Extensions (SGX) support. */ -#ifndef _ASM_X86_SGX_ARCH_H -#define _ASM_X86_SGX_ARCH_H +#ifndef _ASM_X86_SGX_H +#define _ASM_X86_SGX_H #include <linux/bits.h> #include <linux/types.h> +/* + * This file contains both data structures defined by SGX architecture and Linux + * defined software data structures and functions. The two should not be mixed + * together for better readibility. The architectural definitions come first. + */ + /* The SGX specific CPUID function. */ #define SGX_CPUID 0x12 /* EPC enumeration. */ @@ -22,16 +27,36 @@ /* The bitmask for the EPC section type. */ #define SGX_CPUID_EPC_MASK GENMASK(3, 0) +enum sgx_encls_function { + ECREATE = 0x00, + EADD = 0x01, + EINIT = 0x02, + EREMOVE = 0x03, + EDGBRD = 0x04, + EDGBWR = 0x05, + EEXTEND = 0x06, + ELDU = 0x08, + EBLOCK = 0x09, + EPA = 0x0A, + EWB = 0x0B, + ETRACK = 0x0C, + EAUG = 0x0D, + EMODPR = 0x0E, + EMODT = 0x0F, +}; + /** * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not * been completed yet. + * %SGX_CHILD_PRESENT SECS has child pages present in the EPC. * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's * public key does not match IA32_SGXLEPUBKEYHASH. * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received */ enum sgx_return_code { SGX_NOT_TRACKED = 11, + SGX_CHILD_PRESENT = 13, SGX_INVALID_EINITTOKEN = 16, SGX_UNMASKED_EVENT = 128, }; @@ -271,7 +296,7 @@ struct sgx_pcmd { * @header1: constant byte string * @vendor: must be either 0x0000 or 0x8086 * @date: YYYYMMDD in BCD - * @header2: costant byte string + * @header2: constant byte string * @swdefined: software defined value */ struct sgx_sigstruct_header { @@ -335,4 +360,19 @@ struct sgx_sigstruct { #define SGX_LAUNCH_TOKEN_SIZE 304 -#endif /* _ASM_X86_SGX_ARCH_H */ +/* + * Do not put any hardware-defined SGX structure representations below this + * comment! + */ + +#ifdef CONFIG_X86_SGX_KVM +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr); +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr); +#endif + +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd); + +#endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 0bc9b0895f33..d17b39893b79 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -11,6 +11,7 @@ #include <asm/nops.h> #include <asm/cpufeatures.h> +#include <asm/alternative.h> /* "Raw" instruction opcodes */ #define __ASM_CLAC ".byte 0x0f,0x01,0xca" @@ -18,8 +19,6 @@ #ifdef __ASSEMBLY__ -#include <asm/alternative-asm.h> - #ifdef CONFIG_X86_SMAP #define ASM_CLAC \ @@ -37,8 +36,6 @@ #else /* __ASSEMBLY__ */ -#include <asm/alternative.h> - #ifdef CONFIG_X86_SMAP static __always_inline void clac(void) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index c0538f82c9a2..630ff08532be 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -132,6 +132,7 @@ void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); +void cond_wakeup_cpu0(void); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 1d3cbaef4bb7..2acd6cb62328 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -214,7 +214,7 @@ static inline void clflush(volatile void *__p) static inline void clflushopt(volatile void *__p) { - alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0", + alternative_io(".byte 0x3e; clflush %P0", ".byte 0x66; clflush %P0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); @@ -225,7 +225,7 @@ static inline void clwb(volatile void *__p) volatile struct { char x[64]; } *p = __p; asm volatile(ALTERNATIVE_2( - ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])", + ".byte 0x3e; clflush (%[pax])", ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ X86_FEATURE_CLFLUSHOPT, ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 7fb482f0f25b..b6ffe58c70fa 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -5,30 +5,23 @@ * Stack protector works by putting predefined pattern at the start of * the stack frame and verifying that it hasn't been overwritten when * returning from the function. The pattern is called stack canary - * and unfortunately gcc requires it to be at a fixed offset from %gs. - * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64 - * and x86_32 use segment registers differently and thus handles this - * requirement differently. + * and unfortunately gcc historically required it to be at a fixed offset + * from the percpu segment base. On x86_64, the offset is 40 bytes. * - * On x86_64, %gs is shared by percpu area and stack canary. All - * percpu symbols are zero based and %gs points to the base of percpu - * area. The first occupant of the percpu area is always - * fixed_percpu_data which contains stack_canary at offset 40. Userland - * %gs is always saved and restored on kernel entry and exit using - * swapgs, so stack protector doesn't add any complexity there. + * The same segment is shared by percpu area and stack canary. On + * x86_64, percpu symbols are zero based and %gs (64-bit) points to the + * base of percpu area. The first occupant of the percpu area is always + * fixed_percpu_data which contains stack_canary at the approproate + * offset. On x86_32, the stack canary is just a regular percpu + * variable. * - * On x86_32, it's slightly more complicated. As in x86_64, %gs is - * used for userland TLS. Unfortunately, some processors are much - * slower at loading segment registers with different value when - * entering and leaving the kernel, so the kernel uses %fs for percpu - * area and manages %gs lazily so that %gs is switched only when - * necessary, usually during task switch. + * Putting percpu data in %fs on 32-bit is a minor optimization compared to + * using %gs. Since 32-bit userspace normally has %fs == 0, we are likely + * to load 0 into %fs on exit to usermode, whereas with percpu data in + * %gs, we are likely to load a non-null %gs on return to user mode. * - * As gcc requires the stack canary at %gs:20, %gs can't be managed - * lazily if stack protector is enabled, so the kernel saves and - * restores userland %gs on kernel entry and exit. This behavior is - * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in - * system.h to hide the details. + * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector + * support, we can remove some of this complexity. */ #ifndef _ASM_STACKPROTECTOR_H @@ -45,14 +38,6 @@ #include <linux/sched.h> /* - * 24 byte read-only segment initializer for stack canary. Linker - * can't handle the address bit shifting. Address will be set in - * head_32 for boot CPU and setup_per_cpu_areas() for others. - */ -#define GDT_STACK_CANARY_INIT \ - [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18), - -/* * Initialize the stackprotector canary value. * * NOTE: this must only be called from functions that never return @@ -86,7 +71,7 @@ static __always_inline void boot_init_stack_canary(void) #ifdef CONFIG_X86_64 this_cpu_write(fixed_percpu_data.stack_canary, canary); #else - this_cpu_write(stack_canary.canary, canary); + this_cpu_write(__stack_chk_guard, canary); #endif } @@ -95,48 +80,16 @@ static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) #ifdef CONFIG_X86_64 per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary; #else - per_cpu(stack_canary.canary, cpu) = idle->stack_canary; -#endif -} - -static inline void setup_stack_canary_segment(int cpu) -{ -#ifdef CONFIG_X86_32 - unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); - struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu); - struct desc_struct desc; - - desc = gdt_table[GDT_ENTRY_STACK_CANARY]; - set_desc_base(&desc, canary); - write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); -#endif -} - -static inline void load_stack_canary_segment(void) -{ -#ifdef CONFIG_X86_32 - asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory"); + per_cpu(__stack_chk_guard, cpu) = idle->stack_canary; #endif } #else /* STACKPROTECTOR */ -#define GDT_STACK_CANARY_INIT - /* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */ -static inline void setup_stack_canary_segment(int cpu) -{ } - static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) { } -static inline void load_stack_canary_segment(void) -{ -#ifdef CONFIG_X86_32 - asm volatile ("mov %0, %%gs" : : "r" (0)); -#endif -} - #endif /* STACKPROTECTOR */ #endif /* _ASM_STACKPROTECTOR_H */ diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index fdbd9d7b7bca..7b132d0312eb 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -13,12 +13,10 @@ /* image of the saved processor state */ struct saved_context { /* - * On x86_32, all segment registers, with the possible exception of - * gs, are saved at kernel entry in pt_regs. + * On x86_32, all segment registers except gs are saved at kernel + * entry in pt_regs. */ -#ifdef CONFIG_X86_32_LAZY_GS u16 gs; -#endif unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 9f69cc497f4b..b5f0d2ff47e4 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task) else this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else - /* - * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That - * doesn't work on x86-32 because sp1 and - * cpu_current_top_of_stack have different values (because of - * the non-zero stack-padding on 32bit). - */ + /* Xen PV enters the kernel on the thread stack. */ if (static_cpu_has(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index a84333adeef2..80c08c7d5e72 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -80,6 +80,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); } #define __COND_SYSCALL(abi, name) \ + __weak long __##abi##_##name(const struct pt_regs *__unused); \ __weak long __##abi##_##name(const struct pt_regs *__unused) \ { \ return sys_ni_syscall(); \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 06b740bae431..de406d93b515 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -197,13 +197,7 @@ static inline int arch_within_stack_frames(const void * const stack, #endif } -#else /* !__ASSEMBLY__ */ - -#ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) -#endif - -#endif +#endif /* !__ASSEMBLY__ */ /* * Thread-synchronous status. diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h index f241451035fb..027a9258dbca 100644 --- a/arch/x86/include/asm/uv/uv_geo.h +++ b/arch/x86/include/asm/uv/uv_geo.h @@ -10,7 +10,7 @@ #ifndef _ASM_UV_GEO_H #define _ASM_UV_GEO_H -/* Type declaractions */ +/* Type declarations */ /* Size of a geoid_s structure (must be before decl. of geoid_u) */ #define GEOID_SIZE 8 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 5002f52be332..d3e3197917be 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -353,7 +353,7 @@ union uvh_apicid { * * Note there are NO leds on a UV system. This register is only * used by the system controller to monitor system-wide operation. - * There are 64 regs per node. With Nahelem cpus (2 cores per node, + * There are 64 regs per node. With Nehalem cpus (2 cores per node, * 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on * a node. * diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 7068e4bb057d..1a162e559753 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -87,18 +87,6 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, #endif /* - * The maximum amount of extra memory compared to the base size. The - * main scaling factor is the size of struct page. At extreme ratios - * of base:extra, all the base memory can be filled with page - * structures for the extra memory, leaving no space for anything - * else. - * - * 10x seems like a reasonable balance between scaling flexibility and - * leaving a practically usable system. - */ -#define XEN_EXTRA_MEM_RATIO (10) - -/* * Helper functions to write or read unsigned long values to/from * memory, when the access may fault. */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 600a141c8805..b25d3f82c2f3 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -234,7 +234,7 @@ struct boot_params { * handling of page tables. * * These enums should only ever be used by x86 code, and the code that uses - * it should be well contained and compartamentalized. + * it should be well contained and compartmentalized. * * KVM and Xen HVM do not have a subarch as these are expected to follow * standard x86 boot entries. If there is a genuine need for "hypervisor" type @@ -252,7 +252,7 @@ struct boot_params { * @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path, * which start at asm startup_xen() entry point and later jump to the C * xen_start_kernel() entry point. Both domU and dom0 type of guests are - * currently supportd through this PV boot path. + * currently supported through this PV boot path. * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform * systems which do not have the PCI legacy interfaces. * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h index d95d080b30e3..0007ba077c0c 100644 --- a/arch/x86/include/uapi/asm/debugreg.h +++ b/arch/x86/include/uapi/asm/debugreg.h @@ -24,6 +24,7 @@ #define DR_TRAP3 (0x8) /* db3 */ #define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3) +#define DR_BUS_LOCK (0x800) /* bus_lock */ #define DR_STEP (0x4000) /* single-step */ #define DR_SWITCH (0x8000) /* task switch */ diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h index b3d0664fadc9..ac83e25bbf37 100644 --- a/arch/x86/include/uapi/asm/msgbuf.h +++ b/arch/x86/include/uapi/asm/msgbuf.h @@ -12,7 +12,7 @@ * The msqid64_ds structure for x86 architecture with x32 ABI. * * On x86-32 and x86-64 we can just use the generic definition, but - * x32 uses the same binary layout as x86_64, which is differnet + * x32 uses the same binary layout as x86_64, which is different * from other 32-bit architectures. */ diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h index 9034f3007c4e..9690d6899ad9 100644 --- a/arch/x86/include/uapi/asm/sgx.h +++ b/arch/x86/include/uapi/asm/sgx.h @@ -152,7 +152,7 @@ struct sgx_enclave_run { * Most exceptions reported on ENCLU, including those that occur within the * enclave, are fixed up and reported synchronously instead of being delivered * via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are - * never fixed up and are always delivered via standard signals. On synchrously + * never fixed up and are always delivered via standard signals. On synchronously * reported exceptions, -EFAULT is returned and details about the exception are * recorded in @run.exception, the optional sgx_enclave_exception struct. * diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h index f0305dc660c9..fce18eaa070c 100644 --- a/arch/x86/include/uapi/asm/shmbuf.h +++ b/arch/x86/include/uapi/asm/shmbuf.h @@ -9,7 +9,7 @@ * The shmid64_ds structure for x86 architecture with x32 ABI. * * On x86-32 and x86-64 we can just use the generic definition, but - * x32 uses the same binary layout as x86_64, which is differnet + * x32 uses the same binary layout as x86_64, which is different * from other 32-bit architectures. */ diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 844d60eb1882..d0d9b331d3a1 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -139,7 +139,7 @@ struct _fpstate_32 { * The 64-bit FPU frame. (FXSAVE format and later) * * Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is - * larger: 'struct _xstate'. Note that 'struct _xstate' embedds + * larger: 'struct _xstate'. Note that 'struct _xstate' embeds * 'struct _fpstate' so that you can always assume the _fpstate portion * exists so that you can check the magic value. * diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2ddf08351f0b..0704c2a94272 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ KASAN_SANITIZE_sev-es.o := n KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD_test_nx.o := y -OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y ifdef CONFIG_FRAME_POINTER OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y @@ -121,7 +120,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o +obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7bdc0239a943..e90310cbe73a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -830,7 +830,7 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) EXPORT_SYMBOL(acpi_unregister_ioapic); /** - * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base + * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base * has been registered * @handle: ACPI handle of the IOAPIC device * @gsi_base: GSI base associated with the IOAPIC @@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void) /* * Initialize the ACPI boot-time table parser. */ - if (acpi_table_init()) { + if (acpi_locate_initial_tables()) disable_acpi(); - return; - } + else + acpi_reserve_initial_tables(); +} + +int __init early_acpi_boot_init(void) +{ + if (acpi_disabled) + return 1; + + acpi_table_init_complete(); acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void) } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return; + return 1; } } -} - -int __init early_acpi_boot_init(void) -{ - /* - * If acpi_disabled, bail out - */ - if (acpi_disabled) - return 1; /* * Process the Multiple APIC Description Table (MADT), if present @@ -1657,7 +1656,7 @@ static int __init parse_acpi(char *arg) else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); } - /* "acpi=copy_dsdt" copys DSDT */ + /* "acpi=copy_dsdt" copies DSDT */ else if (strcmp(arg, "copy_dsdt") == 0) { acpi_gbl_copy_dsdt_locally = 1; } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index cc1fea76aab0..3f85fcae450c 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -41,7 +41,7 @@ unsigned long acpi_get_wakeup_address(void) * x86_acpi_enter_sleep_state - enter sleep state * @state: Sleep state to enter. * - * Wrapper around acpi_enter_sleep_state() to be called by assmebly. + * Wrapper around acpi_enter_sleep_state() to be called by assembly. */ asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state) { diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 56b6865afb2a..d5d8a352eafa 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -115,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8d778e46725d..84ec0ba491e4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -28,6 +28,7 @@ #include <asm/insn.h> #include <asm/io.h> #include <asm/fixmap.h> +#include <asm/paravirt.h> int __read_mostly alternatives_patched; @@ -74,186 +75,30 @@ do { \ } \ } while (0) -/* - * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes - * that correspond to that nop. Getting from one nop to the next, we - * add to the array the offset that is equal to the sum of all sizes of - * nops preceding the one we are after. - * - * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the - * nice symmetry of sizes of the previous nops. - */ -#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) -static const unsigned char intelnops[] = -{ - GENERIC_NOP1, - GENERIC_NOP2, - GENERIC_NOP3, - GENERIC_NOP4, - GENERIC_NOP5, - GENERIC_NOP6, - GENERIC_NOP7, - GENERIC_NOP8, - GENERIC_NOP5_ATOMIC -}; -static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +const unsigned char x86nops[] = { - NULL, - intelnops, - intelnops + 1, - intelnops + 1 + 2, - intelnops + 1 + 2 + 3, - intelnops + 1 + 2 + 3 + 4, - intelnops + 1 + 2 + 3 + 4 + 5, - intelnops + 1 + 2 + 3 + 4 + 5 + 6, - intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + BYTES_NOP1, + BYTES_NOP2, + BYTES_NOP3, + BYTES_NOP4, + BYTES_NOP5, + BYTES_NOP6, + BYTES_NOP7, + BYTES_NOP8, }; -#endif -#ifdef K8_NOP1 -static const unsigned char k8nops[] = -{ - K8_NOP1, - K8_NOP2, - K8_NOP3, - K8_NOP4, - K8_NOP5, - K8_NOP6, - K8_NOP7, - K8_NOP8, - K8_NOP5_ATOMIC -}; -static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +const unsigned char * const x86_nops[ASM_NOP_MAX+1] = { NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + x86nops, + x86nops + 1, + x86nops + 1 + 2, + x86nops + 1 + 2 + 3, + x86nops + 1 + 2 + 3 + 4, + x86nops + 1 + 2 + 3 + 4 + 5, + x86nops + 1 + 2 + 3 + 4 + 5 + 6, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; -#endif - -#if defined(K7_NOP1) && !defined(CONFIG_X86_64) -static const unsigned char k7nops[] = -{ - K7_NOP1, - K7_NOP2, - K7_NOP3, - K7_NOP4, - K7_NOP5, - K7_NOP6, - K7_NOP7, - K7_NOP8, - K7_NOP5_ATOMIC -}; -static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = -{ - NULL, - k7nops, - k7nops + 1, - k7nops + 1 + 2, - k7nops + 1 + 2 + 3, - k7nops + 1 + 2 + 3 + 4, - k7nops + 1 + 2 + 3 + 4 + 5, - k7nops + 1 + 2 + 3 + 4 + 5 + 6, - k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, -}; -#endif - -#ifdef P6_NOP1 -static const unsigned char p6nops[] = -{ - P6_NOP1, - P6_NOP2, - P6_NOP3, - P6_NOP4, - P6_NOP5, - P6_NOP6, - P6_NOP7, - P6_NOP8, - P6_NOP5_ATOMIC -}; -static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = -{ - NULL, - p6nops, - p6nops + 1, - p6nops + 1 + 2, - p6nops + 1 + 2 + 3, - p6nops + 1 + 2 + 3 + 4, - p6nops + 1 + 2 + 3 + 4 + 5, - p6nops + 1 + 2 + 3 + 4 + 5 + 6, - p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, - p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, -}; -#endif - -/* Initialize these to a safe default */ -#ifdef CONFIG_X86_64 -const unsigned char * const *ideal_nops = p6_nops; -#else -const unsigned char * const *ideal_nops = intel_nops; -#endif - -void __init arch_init_ideal_nops(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - /* - * Due to a decoder implementation quirk, some - * specific Intel CPUs actually perform better with - * the "k8_nops" than with the SDM-recommended NOPs. - */ - if (boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model >= 0x0f && - boot_cpu_data.x86_model != 0x1c && - boot_cpu_data.x86_model != 0x26 && - boot_cpu_data.x86_model != 0x27 && - boot_cpu_data.x86_model < 0x30) { - ideal_nops = k8_nops; - } else if (boot_cpu_has(X86_FEATURE_NOPL)) { - ideal_nops = p6_nops; - } else { -#ifdef CONFIG_X86_64 - ideal_nops = k8_nops; -#else - ideal_nops = intel_nops; -#endif - } - break; - - case X86_VENDOR_HYGON: - ideal_nops = p6_nops; - return; - - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 > 0xf) { - ideal_nops = p6_nops; - return; - } - - fallthrough; - - default: -#ifdef CONFIG_X86_64 - ideal_nops = k8_nops; -#else - if (boot_cpu_has(X86_FEATURE_K8)) - ideal_nops = k8_nops; - else if (boot_cpu_has(X86_FEATURE_K7)) - ideal_nops = k7_nops; - else - ideal_nops = intel_nops; -#endif - } -} /* Use this to add nops to a buffer, then text_poke the whole buffer. */ static void __init_or_module add_nops(void *insns, unsigned int len) @@ -262,7 +107,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, ideal_nops[noplen], noplen); + memcpy(insns, x86_nops[noplen], noplen); insns += noplen; len -= noplen; } @@ -344,19 +189,35 @@ done: static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; - int i; + struct insn insn; + int nop, i = 0; + + /* + * Jump over the non-NOP insns, the remaining bytes must be single-byte + * NOPs, optimize them. + */ + for (;;) { + if (insn_decode_kernel(&insn, &instr[i])) + return; + + if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) + break; + + if ((i += insn.length) >= a->instrlen) + return; + } - for (i = 0; i < a->padlen; i++) { - if (instr[i] != 0x90) + for (nop = i; i < a->instrlen; i++) { + if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i])) return; } local_irq_save(flags); - add_nops(instr + (a->instrlen - a->padlen), a->padlen); + add_nops(instr + nop, i - nop); local_irq_restore(flags); DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", - instr, a->instrlen - a->padlen, a->padlen); + instr, nop, a->instrlen); } /* @@ -388,23 +249,29 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + /* Mask away "NOT" flag bit for feature to test. */ + u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); - BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) { - if (a->padlen > 1) - optimize_nops(a, instr); + BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); - continue; - } + /* + * Patch if either: + * - feature is present + * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * patch if feature is *NOT* present. + */ + if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) + goto next; - DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d", - a->cpuid >> 5, - a->cpuid & 0x1f, + DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", + (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", + feature >> 5, + feature & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen, a->padlen); + replacement, a->replacementlen); DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -428,14 +295,15 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, if (a->replacementlen && is_jmp(replacement[0])) recompute_jump(a, instr, replacement, insn_buff); - if (a->instrlen > a->replacementlen) { - add_nops(insn_buff + a->replacementlen, - a->instrlen - a->replacementlen); - insn_buff_sz += a->instrlen - a->replacementlen; - } + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) + insn_buff[insn_buff_sz] = 0x90; + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); text_poke_early(instr, insn_buff, insn_buff_sz); + +next: + optimize_nops(a, instr); } } @@ -605,7 +473,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insn_buff, p->instr, p->len); - used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); + used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); @@ -723,6 +591,33 @@ void __init alternative_instructions(void) * patching. */ + /* + * Paravirt patching and alternative patching can be combined to + * replace a function call with a short direct code sequence (e.g. + * by setting a constant return value instead of doing that in an + * external function). + * In order to make this work the following sequence is required: + * 1. set (artificial) features depending on used paravirt + * functions which can later influence alternative patching + * 2. apply paravirt patching (generally replacing an indirect + * function call with a direct one) + * 3. apply alternative patching (e.g. replacing a direct function + * call with a custom code sequence) + * Doing paravirt patching after alternative patching would clobber + * the optimization of the custom code with a function call again. + */ + paravirt_set_cap(); + + /* + * First patch paravirt functions, such that we overwrite the indirect + * call with the direct call. + */ + apply_paravirt(__parainstructions, __parainstructions_end); + + /* + * Then patch alternatives, such that those paravirt calls that are in + * alternatives can be overwritten by their immediate fragments. + */ apply_alternatives(__alt_instructions, __alt_instructions_end); #ifdef CONFIG_SMP @@ -741,8 +636,6 @@ void __init alternative_instructions(void) } #endif - apply_paravirt(__parainstructions, __parainstructions_end); - restart_nmi(); alternatives_patched = 1; } @@ -1274,15 +1167,15 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, const void *opcode, size_t len, const void *emulate) { struct insn insn; + int ret; memcpy((void *)tp->text, opcode, len); if (!emulate) emulate = opcode; - kernel_insn_init(&insn, emulate, MAX_INSN_SIZE); - insn_get_length(&insn); + ret = insn_decode_kernel(&insn, emulate); - BUG_ON(!insn_complete(&insn)); + BUG_ON(ret < 0); BUG_ON(len != insn.length); tp->rel_addr = addr - (void *)_stext; @@ -1302,13 +1195,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ - BUG_ON(memcmp(emulate, ideal_nops[len], len)); + BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP8_INSN_OPCODE; tp->rel32 = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ - BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + BUG_ON(memcmp(emulate, x86_nops[len], len)); tp->opcode = JMP32_INSN_OPCODE; tp->rel32 = 0; break; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index b4396952c9a6..09083094eb57 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Shared support code for AMD K8 northbridges and derivates. + * Shared support code for AMD K8 northbridges and derivatives. * Copyright 2006 Andi Kleen, SUSE Labs. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4f26700f314d..4a39fb429f15 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -619,7 +619,7 @@ static void setup_APIC_timer(void) if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; - /* Make LAPIC timer preferrable over percpu HPET */ + /* Make LAPIC timer preferable over percpu HPET */ lapic_clockevent.rating = 150; } @@ -666,7 +666,7 @@ void lapic_update_tsc_freq(void) * In this functions we calibrate APIC bus clocks to the external timer. * * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * irqs synchronous. CPUs connected by the same APIC bus have the very same bus * frequency. * * This was previously done by reading the PIT/HPET and waiting for a wrap @@ -1532,7 +1532,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) * Most probably by now the CPU has serviced that pending interrupt and it * might not have done the ack_APIC_irq() because it thought, interrupt * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear - * the ISR bit and cpu thinks it has already serivced the interrupt. Hence + * the ISR bit and cpu thinks it has already serviced the interrupt. Hence * a vector might get locked. It was noticed for timer irq (vector * 0x31). Issue an extra EOI to clear ISR. * @@ -1657,7 +1657,7 @@ static void setup_local_APIC(void) */ /* * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more + * frequent as it makes the interrupt distribution model be more * like LRU than MRU (the short-term load is more even across CPUs). */ @@ -1875,7 +1875,7 @@ static __init void try_to_enable_x2apic(int remap_mode) /* * Without IR, all CPUs can be addressed by IOAPIC/MSI only - * in physical mode, and CPUs with an APIC ID that cannnot + * in physical mode, and CPUs with an APIC ID that cannot * be addressed must not be brought online. */ x2apic_set_max_apicid(apic_limit); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 73ff4dd426a8..d5c691a3208b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -928,7 +928,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) /* * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger - * and polarity attirbutes. So allow the first user to reprogram the + * and polarity attributes. So allow the first user to reprogram the * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { @@ -994,7 +994,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, /* * Legacy ISA IRQ has already been allocated, just add pin to - * the pin list assoicated with this IRQ and program the IOAPIC + * the pin list associated with this IRQ and program the IOAPIC * entry. The IOAPIC entry */ if (irq_data && irq_data->parent_data) { @@ -1752,7 +1752,7 @@ static inline void ioapic_finish_move(struct irq_data *data, bool moveit) * with masking the ioapic entry and then polling until * Remote IRR was clear before reprogramming the * ioapic I don't trust the Remote IRR bit to be - * completey accurate. + * completely accurate. * * However there appears to be no other way to plug * this race, so if the Remote IRR bit is not @@ -1830,7 +1830,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) /* * Tail end of clearing remote IRR bit (either by delivering the EOI * message via io-apic EOI register write or simulating it using - * mask+edge followed by unnask+level logic) manually when the + * mask+edge followed by unmask+level logic) manually when the * level triggered interrupt is seen as the edge triggered interrupt * at the cpu. */ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3c9c7492252f..6dbdc7c22bb7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -543,6 +543,14 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1) return -ENOSYS; + /* + * Catch any attempt to touch the cascade interrupt on a PIC + * equipped system. + */ + if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY && + virq == PIC_CASCADE_IR)) + return -EINVAL; + for (i = 0; i < nr_irqs; i++) { irqd = irq_domain_get_irq_data(domain, virq + i); BUG_ON(!irqd); @@ -745,6 +753,11 @@ void __init lapic_assign_system_vectors(void) /* Mark the preallocated legacy interrupts */ for (i = 0; i < nr_legacy_irqs(); i++) { + /* + * Don't touch the cascade interrupt. It's unusable + * on PIC equipped machines. See the large comment + * in the IO/APIC code. + */ if (i != PIC_CASCADE_IR) irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i)); } @@ -1045,7 +1058,7 @@ void irq_force_complete_move(struct irq_desc *desc) * * But in case of cpu hotplug this should be a non issue * because if the affinity update happens right before all - * cpus rendevouz in stop machine, there is no way that the + * cpus rendezvous in stop machine, there is no way that the * interrupt can be blocked on the target cpu because all cpus * loops first with interrupts enabled in stop machine, so the * old vector is not yet cleaned up when the interrupt fires. @@ -1054,7 +1067,7 @@ void irq_force_complete_move(struct irq_desc *desc) * of the interrupt on the apic/system bus would be delayed * beyond the point where the target cpu disables interrupts * in stop machine. I doubt that it can happen, but at least - * there is a theroretical chance. Virtualization might be + * there is a theoretical chance. Virtualization might be * able to expose this, but AFAICT the IOAPIC emulation is not * as stupid as the real hardware. * diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 52bc217ca8c3..f5a48e66e4f5 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -369,6 +369,15 @@ static int __init early_get_arch_type(void) return ret; } +/* UV system found, check which APIC MODE BIOS already selected */ +static void __init early_set_apic_mode(void) +{ + if (x2apic_enabled()) + uv_system_type = UV_X2APIC; + else + uv_system_type = UV_LEGACY_APIC; +} + static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) { /* Save OEM_ID passed from ACPI MADT */ @@ -404,11 +413,12 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) else uv_hubless_system |= 0x8; - /* Copy APIC type */ + /* Copy OEM Table ID */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n", oem_id, oem_table_id, uv_system_type, uv_hubless_system); + return 0; } @@ -453,6 +463,7 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id) early_set_hub_type(); /* Other UV setup functions */ + early_set_apic_mode(); early_get_pnodeid(); early_get_apic_socketid_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; @@ -472,29 +483,14 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) if (uv_set_system_type(_oem_id, _oem_table_id) == 0) return 0; - /* Save and Decode OEM Table ID */ + /* Save for display of the OEM Table ID */ uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); - /* This is the most common hardware variant, x2apic mode */ - if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - - /* Only used for very small systems, usually 1 chassis, legacy mode */ - else if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - - else - goto badbios; - pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n", oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY), uv_min_hub_revision_id); return 0; - -badbios: - pr_err("UV: UVarchtype:%s not supported\n", uv_archtype); - BUG(); } enum uv_system_type get_uv_system_type(void) @@ -1671,6 +1667,9 @@ static __init int uv_system_init_hubless(void) if (rc < 0) return rc; + /* Set section block size for current node memory */ + set_block_size(); + /* Create user access node */ if (rc >= 0) uv_setup_proc_files(1); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 660270359d39..241dda687eb9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -94,7 +94,7 @@ * Remove APM dependencies in arch/i386/kernel/process.c * Remove APM dependencies in drivers/char/sysrq.c * Reset time across standby. - * Allow more inititialisation on SMP. + * Allow more initialisation on SMP. * Remove CONFIG_APM_POWER_OFF and make it boot time * configurable (default on). * Make debug only a boot time parameter (remove APM_DEBUG). @@ -766,7 +766,7 @@ static int apm_driver_version(u_short *val) * not cleared until it is acknowledged. * * Additional information is returned in the info pointer, providing - * that APM 1.2 is in use. If no messges are pending the value 0x80 + * that APM 1.2 is in use. If no messages are pending the value 0x80 * is returned (No power management events pending). */ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) @@ -1025,7 +1025,7 @@ static int apm_enable_power_management(int enable) * status which gives the rough battery status, and current power * source. The bat value returned give an estimate as a percentage * of life and a status value for the battery. The estimated life - * if reported is a lifetime in secodnds/minutes at current powwer + * if reported is a lifetime in seconds/minutes at current power * consumption. */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 60b9f42ce3c1..ecd3fd6993d1 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -61,13 +61,6 @@ static void __used common(void) OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -#ifdef CONFIG_PARAVIRT_XXL - BLANK(); - OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable); - OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable); - OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret); -#endif - #ifdef CONFIG_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6e043f295a60..2b411cd00a4e 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -53,11 +53,6 @@ void foo(void) offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); -#ifdef CONFIG_STACKPROTECTOR - BLANK(); - OFFSET(stack_canary_offset, stack_canary, canary); -#endif - BLANK(); DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map)); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 347a956f71ca..2d11384dc9ab 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -628,11 +628,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_init_amd_mc(c); -#ifdef CONFIG_X86_32 - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_K7); -#endif - if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 3ca9be482a9e..d66af2950e06 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -877,7 +877,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct _cpuid4_info_regs *base) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cpu_cacheinfo *this_cpu_ci; struct cacheinfo *this_leaf; int i, sibling; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ab640abe26b6..6bdb69a9a7dc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -161,7 +161,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - GDT_STACK_CANARY_INIT #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); @@ -482,7 +481,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) if (pk) pk->pkru = init_pkru_value; /* - * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE + * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we * update that bit in this CPU's "cpu_info". */ @@ -599,7 +598,6 @@ void load_percpu_segment(int cpu) __loadsegment_simple(gs, 0); wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #endif - load_stack_canary_segment(); } #ifdef CONFIG_X86_32 @@ -1330,7 +1328,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) cpu_set_bug_bits(c); - cpu_set_core_cap_bits(c); + sld_setup(c); fpu__init_system(c); @@ -1404,7 +1402,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) * where GS is unused by the prev and next threads. * * Since neither vendor documents this anywhere that I can see, - * detect it directly instead of hardcoding the choice by + * detect it directly instead of hard-coding the choice by * vendor. * * I've designated AMD's behavior as the "bug" because it's @@ -1748,6 +1746,8 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse); DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; + /* May not be marked __init: used by software suspend */ void syscall_init(void) { @@ -1796,7 +1796,8 @@ DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); #ifdef CONFIG_STACKPROTECTOR -DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 42af31b64c2c..defda61f372d 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA }, + { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, + { X86_FEATURE_SGX1, X86_FEATURE_SGX }, + { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, {} }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 1d9b8aaea06c..7227c15299d0 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -291,7 +291,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif - c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */ + c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */ /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 3b1b01f2b248..da696eb4821a 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) } #endif /* CONFIG_X86_VMX_FEATURE_NAMES */ -static void clear_sgx_caps(void) -{ - setup_clear_cpu_cap(X86_FEATURE_SGX); - setup_clear_cpu_cap(X86_FEATURE_SGX_LC); -} - static int __init nosgx(char *str) { - clear_sgx_caps(); + setup_clear_cpu_cap(X86_FEATURE_SGX); return 0; } @@ -110,23 +104,30 @@ early_param("nosgx", nosgx); void init_ia32_feat_ctl(struct cpuinfo_x86 *c) { + bool enable_sgx_kvm = false, enable_sgx_driver = false; bool tboot = tboot_enabled(); - bool enable_sgx; + bool enable_vmx; u64 msr; if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); - clear_sgx_caps(); + clear_cpu_cap(c, X86_FEATURE_SGX); return; } - /* - * Enable SGX if and only if the kernel supports SGX and Launch Control - * is supported, i.e. disable SGX if the LE hash MSRs can't be written. - */ - enable_sgx = cpu_has(c, X86_FEATURE_SGX) && - cpu_has(c, X86_FEATURE_SGX_LC) && - IS_ENABLED(CONFIG_X86_SGX); + enable_vmx = cpu_has(c, X86_FEATURE_VMX) && + IS_ENABLED(CONFIG_KVM_INTEL); + + if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) { + /* + * Separate out SGX driver enabling from KVM. This allows KVM + * guests to use SGX even if the kernel SGX driver refuses to + * use it. This happens if flexible Launch Control is not + * available. + */ + enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC); + enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM); + } if (msr & FEAT_CTL_LOCKED) goto update_caps; @@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector * for the kernel, e.g. using VMX to hide malicious code. */ - if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) { + if (enable_vmx) { msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; if (tboot) msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX; } - if (enable_sgx) - msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED; + if (enable_sgx_kvm || enable_sgx_driver) { + msr |= FEAT_CTL_SGX_ENABLED; + if (enable_sgx_driver) + msr |= FEAT_CTL_SGX_LC_ENABLED; + } wrmsrl(MSR_IA32_FEAT_CTL, msr); @@ -173,10 +177,29 @@ update_caps: } update_sgx: - if (!(msr & FEAT_CTL_SGX_ENABLED) || - !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) { - if (enable_sgx) - pr_err_once("SGX disabled by BIOS\n"); - clear_sgx_caps(); + if (!(msr & FEAT_CTL_SGX_ENABLED)) { + if (enable_sgx_kvm || enable_sgx_driver) + pr_err_once("SGX disabled by BIOS.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + return; + } + + /* + * VMX feature bit may be cleared due to being disabled in BIOS, + * in which case SGX virtualization cannot be supported either. + */ + if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) { + pr_err_once("SGX virtualization disabled due to lack of VMX.\n"); + enable_sgx_kvm = 0; + } + + if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) { + if (!enable_sgx_kvm) { + pr_err_once("SGX Launch Control is locked. Disable SGX.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX); + } else { + pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n"); + clear_cpu_cap(c, X86_FEATURE_SGX_LC); + } } } diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index ae59115d18f9..0bd6c74e3ba1 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -215,12 +215,12 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) u32 ecx; ecx = cpuid_ecx(0x8000001e); - nodes_per_socket = ((ecx >> 8) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); - nodes_per_socket = ((value >> 3) & 7) + 1; + __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; } if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0e422a544835..8adffc17fa8b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -44,9 +44,9 @@ enum split_lock_detect_state { }; /* - * Default to sld_off because most systems do not support split lock detection - * split_lock_setup() will switch this to sld_warn on systems that support - * split lock detect, unless there is a command line override. + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. */ static enum split_lock_detect_state sld_state __ro_after_init = sld_off; static u64 msr_test_ctrl_cache __ro_after_init; @@ -301,7 +301,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * The operating system must reload CR3 to cause the TLB to be flushed" * * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h - * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE + * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE * to be modified. */ if (c->x86 == 5 && c->x86_model == 9) { @@ -603,6 +603,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) } static void split_lock_init(void); +static void bus_lock_init(void); static void init_intel(struct cpuinfo_x86 *c) { @@ -720,6 +721,7 @@ static void init_intel(struct cpuinfo_x86 *c) tsx_disable(); split_lock_init(); + bus_lock_init(); intel_init_thermal(c); } @@ -1020,16 +1022,15 @@ static bool split_lock_verify_msr(bool on) return ctrl == tmp; } -static void __init split_lock_setup(void) +static void __init sld_state_setup(void) { enum split_lock_detect_state state = sld_warn; char arg[20]; int i, ret; - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - } ret = cmdline_find_option(boot_command_line, "split_lock_detect", arg, sizeof(arg)); @@ -1041,17 +1042,14 @@ static void __init split_lock_setup(void) } } } + sld_state = state; +} - switch (state) { - case sld_off: - pr_info("disabled\n"); +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); return; - case sld_warn: - pr_info("warning about user-space split_locks\n"); - break; - case sld_fatal: - pr_info("sending SIGBUS on user-space split_locks\n"); - break; } rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); @@ -1061,7 +1059,9 @@ static void __init split_lock_setup(void) return; } - sld_state = state; + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } @@ -1118,6 +1118,29 @@ bool handle_guest_split_lock(unsigned long ip) } EXPORT_SYMBOL_GPL(handle_guest_split_lock); +static void bus_lock_init(void) +{ + u64 val; + + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) || + (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) + return; + + /* + * Enable #DB for bus lock. All bus locks are handled in #DB except + * split locks are handled in #AC in the fatal case. + */ + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + bool handle_user_split_lock(struct pt_regs *regs, long error_code) { if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) @@ -1126,6 +1149,21 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code) return true; } +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + /* * This function is called only when switching between tasks with * different split-lock detection modes. It sets the MSR for the @@ -1166,7 +1204,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { {} }; -void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) +static void __init split_lock_setup(struct cpuinfo_x86 *c) { const struct x86_cpu_id *m; u64 ia32_core_caps; @@ -1193,5 +1231,56 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) } cpu_model_supports_sld = true; - split_lock_setup(); + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: warning on user-space bus_locks\n"); + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} + +#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 + +/** + * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU + * + * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in + * a hybrid processor. If the processor is not hybrid, returns 0. + */ +u8 get_this_hybrid_cpu_type(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return 0; + + return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7962355436da..bf7fe87a7e88 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -529,7 +529,7 @@ static void mce_irq_work_cb(struct irq_work *entry) * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses up to page granuality for now. + * parser). So only support physical addresses up to page granularity for now. */ int mce_usable_address(struct mce *m) { diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 7b360731fc2d..4e86d97f9653 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -74,6 +74,7 @@ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); MCE_INJECT_SET(synd); +MCE_INJECT_SET(ipid); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -88,11 +89,13 @@ MCE_INJECT_GET(status); MCE_INJECT_GET(misc); MCE_INJECT_GET(addr); MCE_INJECT_GET(synd); +MCE_INJECT_GET(ipid); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n"); static void setup_inj_struct(struct mce *m) { @@ -629,6 +632,8 @@ static const char readme_msg[] = "\t is present in hardware. \n" "\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n" "\t APIC interrupt handler to handle the error. \n" +"\n" +"ipid:\t IPID (AMD-specific)\n" "\n"; static ssize_t @@ -652,6 +657,7 @@ static struct dfs_node { { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index e309476743b7..acfd5d9f93c6 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -486,6 +486,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 83df991314c5..17e631443116 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -142,7 +142,7 @@ static struct severity { MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) ), MCESEV( - KEEP, "Non signalled machine check", + KEEP, "Non signaled machine check", SER, BITCLR(MCI_STATUS_S) ), @@ -218,15 +218,15 @@ static struct severity { static bool is_copy_from_user(struct pt_regs *regs) { u8 insn_buf[MAX_INSN_SIZE]; - struct insn insn; unsigned long addr; + struct insn insn; + int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return false; - kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); - insn_get_opcode(&insn); - if (!insn.opcode.got) + ret = insn_decode_kernel(&insn, insn_buf); + if (ret < 0) return false; switch (insn.opcode.value) { @@ -234,10 +234,6 @@ static bool is_copy_from_user(struct pt_regs *regs) case 0x8A: case 0x8B: /* MOVZ mem,reg */ case 0xB60F: case 0xB70F: - insn_get_modrm(&insn); - insn_get_sib(&insn); - if (!insn.modrm.got || !insn.sib.got) - return false; addr = (unsigned long)insn_get_addr_ref(&insn, regs); break; /* REP MOVS */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b935e1b5f115..6a6318e9590c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -629,16 +629,16 @@ static ssize_t reload_store(struct device *dev, if (val != 1) return size; - tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); - if (tmp_ret != UCODE_NEW) - return size; - get_online_cpus(); ret = check_online_cpus(); if (ret) goto put; + tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); + if (tmp_ret != UCODE_NEW) + goto put; + mutex_lock(µcode_mutex); ret = microcode_reload_late(); mutex_unlock(µcode_mutex); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e88bc296afca..22f13343b5da 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -60,23 +60,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) set_irq_regs(old_regs); } -int hv_setup_vmbus_irq(int irq, void (*handler)(void)) +void hv_setup_vmbus_handler(void (*handler)(void)) { - /* - * The 'irq' argument is ignored on x86/x64 because a hard-coded - * interrupt vector is used for Hyper-V interrupts. - */ vmbus_handler = handler; - return 0; } +EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler); -void hv_remove_vmbus_irq(void) +void hv_remove_vmbus_handler(void) { /* We have no way to deallocate the interrupt gate */ vmbus_handler = NULL; } -EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); -EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler); /* * Routines to do per-architecture handling of stimer0 @@ -95,21 +90,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) set_irq_regs(old_regs); } -int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)) +/* For x86/x64, override weak placeholders in hyperv_timer.c */ +void hv_setup_stimer0_handler(void (*handler)(void)) { - *vector = HYPERV_STIMER0_VECTOR; - *irq = -1; /* Unused on x86/x64 */ hv_stimer0_handler = handler; - return 0; } -EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq); -void hv_remove_stimer0_irq(int irq) +void hv_remove_stimer0_handler(void) { /* We have no way to deallocate the interrupt gate */ hv_stimer0_handler = NULL; } -EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq); void hv_setup_kexec_handler(void (*handler)(void)) { @@ -197,7 +188,7 @@ static unsigned char hv_get_nmi_reason(void) #ifdef CONFIG_X86_LOCAL_APIC /* * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes - * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle + * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle * unknown NMI on the first CPU which gets it. */ static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs) @@ -274,12 +265,13 @@ static void __init ms_hyperv_init_platform(void) * Extract the features and hints */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); - ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES); + ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n", - ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features); + pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", + ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, + ms_hyperv.misc_features); ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS); @@ -325,7 +317,7 @@ static void __init ms_hyperv_init_platform(void) x86_platform.calibrate_cpu = hv_get_tsc_khz; } - if (ms_hyperv.features_b & HV_ISOLATION) { + if (ms_hyperv.priv_high & HV_ISOLATION) { ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG); ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG); @@ -428,7 +420,7 @@ static void __init ms_hyperv_init_platform(void) /* * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic, - * set x2apic destination mode to physcial mode when x2apic is available + * set x2apic destination mode to physical mode when x2apic is available * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs * have 8-bit APIC id. */ diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 9231640782fa..0c3b372318b7 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -434,7 +434,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, state->range_sizek = sizek - second_sizek; } -/* Mininum size of mtrr block that can take hole: */ +/* Minimum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 28c8a23aa42e..a76694bffe86 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -799,7 +799,7 @@ void mtrr_ap_init(void) * * This routine is called in two cases: * - * 1. very earily time of software resume, when there absolutely + * 1. very early time of software resume, when there absolutely * isn't mtrr entry changes; * * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 698bb26aeb6e..23001ae03e82 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -192,7 +192,7 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid) * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz * - * Probe by trying to write the first of the L3 cach mask registers + * Probe by trying to write the first of the L3 cache mask registers * and checking that the bits stick. Max CLOSids is always 4 and max cbm length * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7ac31210e452..dbeaa8409313 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -387,7 +387,7 @@ void mon_event_count(void *info) * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so * that: * - * current bandwdith(cur_bw) < user specified bandwidth(user_bw) + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) * * This uses the MBM counters to measure the bandwidth and MBA throttle * MSRs to control the bandwidth for a particular rdtgrp. It builds on the @@ -397,7 +397,7 @@ void mon_event_count(void *info) * timer. Having 1s interval makes the calculation of bandwidth simpler. * * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid uncecessarily restricting + * be a need to increase the bandwidth to avoid unnecessarily restricting * the L2 <-> L3 traffic. * * Since MBA controls the L2 external bandwidth where as MBM measures the @@ -480,7 +480,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) /* * Delta values are updated dynamically package wise for each - * rdtgrp everytime the throttle MSR changes value. + * rdtgrp every time the throttle MSR changes value. * * This is because (1)the increase in bandwidth is not perfectly * linear and only "approximately" linear even when the hardware diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index e916646adc69..935af2ac6b1a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1307,7 +1307,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * If the thread does not get on the CPU for whatever * reason and the process which sets up the region is * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will derefence + * state and once it gets on the CPU it will dereference * the cleared, but not freed, plr struct resulting in an * empty pseudo-locking loop. */ @@ -1391,7 +1391,7 @@ out: * group is removed from user space via a "rmdir" from userspace or the * unmount of the resctrl filesystem. On removal the resource group does * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus assymmetry with the creation where the + * removed directly. There is thus asymmetry with the creation where the * &struct pseudo_lock_region is removed here while it was not created in * rdtgroup_pseudo_lock_create(). * diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index f9190adc52cb..01fd30e7829d 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * User interface for Resource Alloction in Resource Director Technology(RDT) + * User interface for Resource Allocation in Resource Director Technology(RDT) * * Copyright (C) 2016 Intel Corporation * @@ -294,7 +294,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against resctrl_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from update_closid_rmid() is proteced against __switch_to() because + * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. */ static void update_cpu_closid_rmid(void *info) @@ -2555,7 +2555,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, /* * This creates a directory mon_data which contains the monitored data. * - * mon_data has one directory for each domain whic are named + * mon_data has one directory for each domain which are named * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data * with L3 domain looks as below: * ./mon_data: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 972ec3bfa9c0..21d1f062895a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index 91d3dc784a29..9c1656779b2a 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -3,3 +3,4 @@ obj-y += \ encl.o \ ioctl.o \ main.o +obj-$(CONFIG_X86_SGX_KVM) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 8ce6d8371cfb..aa9b8b868867 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct miscdevice sgx_dev_enclave = { .minor = MISC_DYNAMIC_MINOR, .name = "sgx_enclave", @@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = { .fops = &sgx_encl_fops, }; -static struct miscdevice sgx_dev_provision = { - .minor = MISC_DYNAMIC_MINOR, - .name = "sgx_provision", - .nodename = "sgx_provision", - .fops = &sgx_provision_fops, -}; - int __init sgx_drv_init(void) { unsigned int eax, ebx, ecx, edx; @@ -187,11 +176,5 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = misc_register(&sgx_dev_provision); - if (ret) { - misc_deregister(&sgx_dev_enclave); - return ret; - } - return 0; } diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 7449ef33f081..3be203297988 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -7,7 +7,7 @@ #include <linux/shmem_fs.h> #include <linux/suspend.h> #include <linux/sched/mm.h> -#include "arch.h" +#include <asm/sgx.h> #include "encl.h" #include "encls.h" #include "sgx.h" @@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page, ret = __sgx_encl_eldu(encl_page, epc_page, secs_page); if (ret) { - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(ret); } @@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref) if (sgx_unmark_page_reclaimable(entry->epc_page)) continue; - sgx_free_epc_page(entry->epc_page); + sgx_encl_free_epc_page(entry->epc_page); encl->secs_child_cnt--; entry->epc_page = NULL; } @@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref) xa_destroy(&encl->page_array); if (!encl->secs_child_cnt && encl->secs.epc_page) { - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; } @@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref) va_page = list_first_entry(&encl->va_pages, struct sgx_va_page, list); list_del(&va_page->list); - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); kfree(va_page); } @@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void) ret = __epa(sgx_get_epc_virt_addr(epc_page)); if (ret) { WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret); - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); return ERR_PTR(-EFAULT); } @@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page) return slot == SGX_VA_SLOT_COUNT; } + +/** + * sgx_encl_free_epc_page - free an EPC page assigned to an enclave + * @page: EPC page to be freed + * + * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and + * only upon success, it puts the page back to free page list. Otherwise, it + * gives a WARNING to indicate page is leaked. + */ +void sgx_encl_free_epc_page(struct sgx_epc_page *page) +{ + int ret; + + WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + + ret = __eremove(sgx_get_epc_virt_addr(page)); + if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret)) + return; + + sgx_free_epc_page(page); +} diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index d8d30ccbef4c..6e74f85b6264 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset); bool sgx_va_page_full(struct sgx_va_page *va_page); +void sgx_encl_free_epc_page(struct sgx_epc_page *page); #endif /* _X86_ENCL_H */ diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 443188fe7e70..9b204843b78d 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -11,21 +11,6 @@ #include <asm/traps.h> #include "sgx.h" -enum sgx_encls_function { - ECREATE = 0x00, - EADD = 0x01, - EINIT = 0x02, - EREMOVE = 0x03, - EDGBRD = 0x04, - EDGBWR = 0x05, - EEXTEND = 0x06, - ELDU = 0x08, - EBLOCK = 0x09, - EPA = 0x0A, - EWB = 0x0B, - ETRACK = 0x0C, -}; - /** * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr * @@ -55,6 +40,19 @@ enum sgx_encls_function { } while (0); \ } +/* + * encls_faulted() - Check if an ENCLS leaf faulted given an error code + * @ret: the return value of an ENCLS leaf function call + * + * Return: + * - true: ENCLS leaf faulted. + * - false: Otherwise. + */ +static inline bool encls_faulted(int ret) +{ + return ret & ENCLS_FAULT_FLAG; +} + /** * encls_failed() - Check if an ENCLS function failed * @ret: the return value of an ENCLS function call @@ -65,7 +63,7 @@ enum sgx_encls_function { */ static inline bool encls_failed(int ret) { - if (ret & ENCLS_FAULT_FLAG) + if (encls_faulted(ret)) return ENCLS_TRAPNR(ret) != X86_TRAP_PF; return !!ret; diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 90a5caf76939..83df20e3e633 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -2,6 +2,7 @@ /* Copyright(c) 2016-20 Intel Corporation. */ #include <asm/mman.h> +#include <asm/sgx.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page) encl->page_cnt--; if (va_page) { - sgx_free_epc_page(va_page->epc_page); + sgx_encl_free_epc_page(va_page->epc_page); list_del(&va_page->list); kfree(va_page); } @@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) return 0; err_out: - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; err_out_backing: @@ -365,7 +366,7 @@ err_out_unlock: mmap_read_unlock(current->mm); err_out_free: - sgx_free_epc_page(epc_page); + sgx_encl_free_epc_page(epc_page); kfree(encl_page); return ret; @@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { u64 mrsigner[4]; - int i, j, k; + int i, j; void *addr; int ret; @@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, preempt_disable(); - for (k = 0; k < 4; k++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]); + sgx_update_lepubkeyhash(mrsigner); ret = __einit(sigstruct, token, addr); @@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, } } - if (ret & ENCLS_FAULT_FLAG) { + if (encls_faulted(ret)) { if (encls_failed(ret)) ENCLS_WARN(ret, "EINIT"); @@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) { struct sgx_sigstruct *sigstruct; struct sgx_enclave_init init_arg; - struct page *initp_page; void *token; int ret; @@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) if (copy_from_user(&init_arg, arg, sizeof(init_arg))) return -EFAULT; - initp_page = alloc_page(GFP_KERNEL); - if (!initp_page) + /* + * 'sigstruct' must be on a page boundary and 'token' on a 512 byte + * boundary. kmalloc() will give this alignment when allocating + * PAGE_SIZE bytes. + */ + sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!sigstruct) return -ENOMEM; - sigstruct = kmap(initp_page); token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2); memset(token, 0, SGX_LAUNCH_TOKEN_SIZE); @@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg) ret = sgx_encl_init(encl, sigstruct, token); out: - kunmap(initp_page); - __free_page(initp_page); + kfree(sigstruct); return ret; } @@ -665,24 +667,11 @@ out: static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg) { struct sgx_enclave_provision params; - struct file *file; if (copy_from_user(¶ms, arg, sizeof(params))) return -EFAULT; - file = fget(params.fd); - if (!file) - return -EINVAL; - - if (file->f_op != &sgx_provision_fops) { - fput(file); - return -EINVAL; - } - - encl->attributes_mask |= SGX_ATTR_PROVISIONKEY; - - fput(file); - return 0; + return sgx_set_attribute(&encl->attributes_mask, params.fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8df81a3ed945..63d3de02bbcc 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2016-20 Intel Corporation. */ +#include <linux/file.h> #include <linux/freezer.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/miscdevice.h> #include <linux/pagemap.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> +#include <asm/sgx.h> #include "driver.h" #include "encl.h" #include "encls.h" @@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq); * with sgx_reclaimer_lock acquired. */ static LIST_HEAD(sgx_active_page_list); - static DEFINE_SPINLOCK(sgx_reclaimer_lock); +/* The free page list lock protected variables prepend the lock. */ +static unsigned long sgx_nr_free_pages; + +/* Nodes with one or more EPC sections. */ +static nodemask_t sgx_numa_mask; + +/* + * Array with one list_head for each possible NUMA node. Each + * list contains all the sgx_epc_section's which are on that + * node. + */ +static struct sgx_numa_node *sgx_numa_nodes; + +static LIST_HEAD(sgx_dirty_page_list); + /* - * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS - * pages whose child pages blocked EREMOVE. + * Reset post-kexec EPC pages to the uninitialized state. The pages are removed + * from the input list, and made available for the page allocator. SECS pages + * prepending their children in the input list are left intact. */ -static void sgx_sanitize_section(struct sgx_epc_section *section) +static void __sgx_sanitize_pages(struct list_head *dirty_page_list) { struct sgx_epc_page *page; LIST_HEAD(dirty); int ret; - /* init_laundry_list is thread-local, no need for a lock: */ - while (!list_empty(§ion->init_laundry_list)) { + /* dirty_page_list is thread-local, no need for a lock: */ + while (!list_empty(dirty_page_list)) { if (kthread_should_stop()) return; - /* needed for access to ->page_list: */ - spin_lock(§ion->lock); - - page = list_first_entry(§ion->init_laundry_list, - struct sgx_epc_page, list); + page = list_first_entry(dirty_page_list, struct sgx_epc_page, list); ret = __eremove(sgx_get_epc_virt_addr(page)); - if (!ret) - list_move(&page->list, §ion->page_list); - else + if (!ret) { + /* + * page is now sanitized. Make it available via the SGX + * page allocator: + */ + list_del(&page->list); + sgx_free_epc_page(page); + } else { + /* The page is not yet clean - move to the dirty list. */ list_move_tail(&page->list, &dirty); - - spin_unlock(§ion->lock); + } cond_resched(); } - list_splice(&dirty, §ion->init_laundry_list); + list_splice(&dirty, dirty_page_list); } static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page) @@ -195,10 +214,10 @@ static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl) /* * Swap page to the regular memory transformed to the blocked state by using - * EBLOCK, which means that it can no loger be referenced (no new TLB entries). + * EBLOCK, which means that it can no longer be referenced (no new TLB entries). * * The first trial just tries to write the page assuming that some other thread - * has reset the count for threads inside the enlave by using ETRACK, and + * has reset the count for threads inside the enclave by using ETRACK, and * previous thread count has been zeroed out. The second trial calls ETRACK * before EWB. If that fails we kick all the HW threads out, and then do EWB, * which should be guaranteed the succeed. @@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page, sgx_encl_ewb(encl->secs.epc_page, &secs_backing); - sgx_free_epc_page(encl->secs.epc_page); + sgx_encl_free_epc_page(encl->secs.epc_page); encl->secs.epc_page = NULL; sgx_encl_put_backing(&secs_backing, true); @@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void) struct sgx_epc_section *section; struct sgx_encl_page *encl_page; struct sgx_epc_page *epc_page; + struct sgx_numa_node *node; pgoff_t page_index; int cnt = 0; int ret; @@ -379,50 +399,33 @@ skip: epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED; section = &sgx_epc_sections[epc_page->section]; - spin_lock(§ion->lock); - list_add_tail(&epc_page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); - } -} - -static unsigned long sgx_nr_free_pages(void) -{ - unsigned long cnt = 0; - int i; - - for (i = 0; i < sgx_nr_epc_sections; i++) - cnt += sgx_epc_sections[i].free_cnt; + node = section->node; - return cnt; + spin_lock(&node->lock); + list_add_tail(&epc_page->list, &node->free_page_list); + sgx_nr_free_pages++; + spin_unlock(&node->lock); + } } static bool sgx_should_reclaim(unsigned long watermark) { - return sgx_nr_free_pages() < watermark && - !list_empty(&sgx_active_page_list); + return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list); } static int ksgxd(void *p) { - int i; - set_freezable(); /* * Sanitize pages in order to recover from kexec(). The 2nd pass is * required for SECS pages, whose child pages blocked EREMOVE. */ - for (i = 0; i < sgx_nr_epc_sections; i++) - sgx_sanitize_section(&sgx_epc_sections[i]); - - for (i = 0; i < sgx_nr_epc_sections; i++) { - sgx_sanitize_section(&sgx_epc_sections[i]); + __sgx_sanitize_pages(&sgx_dirty_page_list); + __sgx_sanitize_pages(&sgx_dirty_page_list); - /* Should never happen. */ - if (!list_empty(&sgx_epc_sections[i].init_laundry_list)) - WARN(1, "EPC section %d has unsanitized pages.\n", i); - } + /* sanity check: */ + WARN_ON(!list_empty(&sgx_dirty_page_list)); while (!kthread_should_stop()) { if (try_to_freeze()) @@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void) return true; } -static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section) +static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid) { - struct sgx_epc_page *page; + struct sgx_numa_node *node = &sgx_numa_nodes[nid]; + struct sgx_epc_page *page = NULL; - spin_lock(§ion->lock); + spin_lock(&node->lock); - if (list_empty(§ion->page_list)) { - spin_unlock(§ion->lock); + if (list_empty(&node->free_page_list)) { + spin_unlock(&node->lock); return NULL; } - page = list_first_entry(§ion->page_list, struct sgx_epc_page, list); + page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list); list_del_init(&page->list); - section->free_cnt--; + sgx_nr_free_pages--; + + spin_unlock(&node->lock); - spin_unlock(§ion->lock); return page; } /** * __sgx_alloc_epc_page() - Allocate an EPC page * - * Iterate through EPC sections and borrow a free EPC page to the caller. When a - * page is no longer needed it must be released with sgx_free_epc_page(). + * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start + * from the NUMA node, where the caller is executing. * * Return: - * an EPC page, - * -errno on error + * - an EPC page: A borrowed EPC pages were available. + * - NULL: Out of EPC pages. */ struct sgx_epc_page *__sgx_alloc_epc_page(void) { - struct sgx_epc_section *section; struct sgx_epc_page *page; - int i; + int nid_of_current = numa_node_id(); + int nid = nid_of_current; - for (i = 0; i < sgx_nr_epc_sections; i++) { - section = &sgx_epc_sections[i]; + if (node_isset(nid_of_current, sgx_numa_mask)) { + page = __sgx_alloc_epc_page_from_node(nid_of_current); + if (page) + return page; + } + + /* Fall back to the non-local NUMA nodes: */ + while (true) { + nid = next_node_in(nid, sgx_numa_mask); + if (nid == nid_of_current) + break; - page = __sgx_alloc_epc_page_from_section(section); + page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; } @@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim) * sgx_free_epc_page() - Free an EPC page * @page: an EPC page * - * Call EREMOVE for an EPC page and insert it back to the list of free pages. + * Put the EPC page back to the list of free pages. It's the caller's + * responsibility to make sure that the page is in uninitialized state. In other + * words, do EREMOVE, EWB or whatever operation is necessary before calling + * this function. */ void sgx_free_epc_page(struct sgx_epc_page *page) { struct sgx_epc_section *section = &sgx_epc_sections[page->section]; - int ret; + struct sgx_numa_node *node = section->node; - WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED); + spin_lock(&node->lock); - ret = __eremove(sgx_get_epc_virt_addr(page)); - if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret)) - return; + list_add_tail(&page->list, &node->free_page_list); + sgx_nr_free_pages++; - spin_lock(§ion->lock); - list_add_tail(&page->list, §ion->page_list); - section->free_cnt++; - spin_unlock(§ion->lock); + spin_unlock(&node->lock); } static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, @@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, } section->phys_addr = phys_addr; - spin_lock_init(§ion->lock); - INIT_LIST_HEAD(§ion->page_list); - INIT_LIST_HEAD(§ion->init_laundry_list); for (i = 0; i < nr_pages; i++) { section->pages[i].section = index; section->pages[i].flags = 0; section->pages[i].owner = NULL; - list_add_tail(§ion->pages[i].list, §ion->init_laundry_list); + list_add_tail(§ion->pages[i].list, &sgx_dirty_page_list); } - section->free_cnt = nr_pages; return true; } @@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void) { u32 eax, ebx, ecx, edx, type; u64 pa, size; + int nid; int i; + sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL); + if (!sgx_numa_nodes) + return false; + for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) { cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx); @@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void) break; } + nid = numa_map_to_online_node(phys_to_target_node(pa)); + if (nid == NUMA_NO_NODE) { + /* The physical address is already printed above. */ + pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n"); + nid = 0; + } + + if (!node_isset(nid, sgx_numa_mask)) { + spin_lock_init(&sgx_numa_nodes[nid].lock); + INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list); + node_set(nid, sgx_numa_mask); + } + + sgx_epc_sections[i].node = &sgx_numa_nodes[nid]; + sgx_nr_epc_sections++; } @@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void) return true; } +/* + * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller. + * Bare-metal driver requires to update them to hash of enclave's signer + * before EINIT. KVM needs to update them to guest's virtual MSR values + * before doing EINIT from guest. + */ +void sgx_update_lepubkeyhash(u64 *lepubkeyhash) +{ + int i; + + WARN_ON_ONCE(preemptible()); + + for (i = 0; i < 4; i++) + wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); +} + +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct miscdevice sgx_dev_provision = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_provision", + .nodename = "sgx_provision", + .fops = &sgx_provision_fops, +}; + +/** + * sgx_set_attribute() - Update allowed attributes given file descriptor + * @allowed_attributes: Pointer to allowed enclave attributes + * @attribute_fd: File descriptor for specific attribute + * + * Append enclave attribute indicated by file descriptor to allowed + * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by + * /dev/sgx_provision is supported. + * + * Return: + * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes + * -EINVAL: Invalid, or not supported file descriptor + */ +int sgx_set_attribute(unsigned long *allowed_attributes, + unsigned int attribute_fd) +{ + struct file *file; + + file = fget(attribute_fd); + if (!file) + return -EINVAL; + + if (file->f_op != &sgx_provision_fops) { + fput(file); + return -EINVAL; + } + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + + fput(file); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_set_attribute); + static int __init sgx_init(void) { int ret; @@ -716,12 +806,28 @@ static int __init sgx_init(void) goto err_page_cache; } - ret = sgx_drv_init(); + ret = misc_register(&sgx_dev_provision); if (ret) goto err_kthread; + /* + * Always try to initialize the native *and* KVM drivers. + * The KVM driver is less picky than the native one and + * can function if the native one is not supported on the + * current system or fails to initialize. + * + * Error out only if both fail to initialize. + */ + ret = sgx_drv_init(); + + if (sgx_vepc_init() && ret) + goto err_provision; + return 0; +err_provision: + misc_deregister(&sgx_dev_provision); + err_kthread: kthread_stop(ksgxd_tsk); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 5fa42d143feb..4628acec0009 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -8,11 +8,15 @@ #include <linux/rwsem.h> #include <linux/types.h> #include <asm/asm.h> -#include "arch.h" +#include <asm/sgx.h> #undef pr_fmt #define pr_fmt(fmt) "sgx: " fmt +#define EREMOVE_ERROR_MESSAGE \ + "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \ + "Refer to Documentation/x86/sgx.rst for more information." + #define SGX_MAX_EPC_SECTIONS 8 #define SGX_EEXTEND_BLOCK_SIZE 256 #define SGX_NR_TO_SCAN 16 @@ -30,28 +34,25 @@ struct sgx_epc_page { }; /* + * Contains the tracking data for NUMA nodes having EPC pages. Most importantly, + * the free page list local to the node is stored here. + */ +struct sgx_numa_node { + struct list_head free_page_list; + spinlock_t lock; +}; + +/* * The firmware can define multiple chunks of EPC to the different areas of the * physical memory e.g. for memory areas of the each node. This structure is * used to store EPC pages for one EPC section and virtual memory area where * the pages have been mapped. - * - * 'lock' must be held before accessing 'page_list' or 'free_cnt'. */ struct sgx_epc_section { unsigned long phys_addr; void *virt_addr; struct sgx_epc_page *pages; - - spinlock_t lock; - struct list_head page_list; - unsigned long free_cnt; - - /* - * Pages which need EREMOVE run on them before they can be - * used. Only safe to be accessed in ksgxd and init code. - * Not protected by locks. - */ - struct list_head init_laundry_list; + struct sgx_numa_node *node; }; extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; @@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page); int sgx_unmark_page_reclaimable(struct sgx_epc_page *page); struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim); +#ifdef CONFIG_X86_SGX_KVM +int __init sgx_vepc_init(void); +#else +static inline int __init sgx_vepc_init(void) +{ + return -ENODEV; +} +#endif + +void sgx_update_lepubkeyhash(u64 *lepubkeyhash); + #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index 000000000000..6ad165a5c0cc --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Device driver to expose SGX enclave memory to KVM guests. + * + * Copyright(c) 2021 Intel Corporation. + */ + +#include <linux/miscdevice.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/xarray.h> +#include <asm/sgx.h> +#include <uapi/asm/sgx.h> + +#include "encls.h" +#include "sgx.h" + +struct sgx_vepc { + struct xarray page_array; + struct mutex lock; +}; + +/* + * Temporary SECS pages that cannot be EREMOVE'd due to having child in other + * virtual EPC instances, and the lock to protect it. + */ +static struct mutex zombie_secs_pages_lock; +static struct list_head zombie_secs_pages; + +static int __sgx_vepc_fault(struct sgx_vepc *vepc, + struct vm_area_struct *vma, unsigned long addr) +{ + struct sgx_epc_page *epc_page; + unsigned long index, pfn; + int ret; + + WARN_ON(!mutex_is_locked(&vepc->lock)); + + /* Calculate index of EPC page in virtual EPC's page_array */ + index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); + + epc_page = xa_load(&vepc->page_array, index); + if (epc_page) + return 0; + + epc_page = sgx_alloc_epc_page(vepc, false); + if (IS_ERR(epc_page)) + return PTR_ERR(epc_page); + + ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL)); + if (ret) + goto err_free; + + pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page)); + + ret = vmf_insert_pfn(vma, addr, pfn); + if (ret != VM_FAULT_NOPAGE) { + ret = -EFAULT; + goto err_delete; + } + + return 0; + +err_delete: + xa_erase(&vepc->page_array, index); +err_free: + sgx_free_epc_page(epc_page); + return ret; +} + +static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct sgx_vepc *vepc = vma->vm_private_data; + int ret; + + mutex_lock(&vepc->lock); + ret = __sgx_vepc_fault(vepc, vma, vmf->address); + mutex_unlock(&vepc->lock); + + if (!ret) + return VM_FAULT_NOPAGE; + + if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) { + mmap_read_unlock(vma->vm_mm); + return VM_FAULT_RETRY; + } + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct sgx_vepc_vm_ops = { + .fault = sgx_vepc_fault, +}; + +static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct sgx_vepc *vepc = file->private_data; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &sgx_vepc_vm_ops; + /* Don't copy VMA in fork() */ + vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY; + vma->vm_private_data = vepc; + + return 0; +} + +static int sgx_vepc_free_page(struct sgx_epc_page *epc_page) +{ + int ret; + + /* + * Take a previously guest-owned EPC page and return it to the + * general EPC page pool. + * + * Guests can not be trusted to have left this page in a good + * state, so run EREMOVE on the page unconditionally. In the + * case that a guest properly EREMOVE'd this page, a superfluous + * EREMOVE is harmless. + */ + ret = __eremove(sgx_get_epc_virt_addr(epc_page)); + if (ret) { + /* + * Only SGX_CHILD_PRESENT is expected, which is because of + * EREMOVE'ing an SECS still with child, in which case it can + * be handled by EREMOVE'ing the SECS again after all pages in + * virtual EPC have been EREMOVE'd. See comments in below in + * sgx_vepc_release(). + * + * The user of virtual EPC (KVM) needs to guarantee there's no + * logical processor is still running in the enclave in guest, + * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be + * handled here. + */ + WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE, + ret, ret); + return ret; + } + + sgx_free_epc_page(epc_page); + + return 0; +} + +static int sgx_vepc_release(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc = file->private_data; + struct sgx_epc_page *epc_page, *tmp, *entry; + unsigned long index; + + LIST_HEAD(secs_pages); + + xa_for_each(&vepc->page_array, index, entry) { + /* + * Remove all normal, child pages. sgx_vepc_free_page() + * will fail if EREMOVE fails, but this is OK and expected on + * SECS pages. Those can only be EREMOVE'd *after* all their + * child pages. Retries below will clean them up. + */ + if (sgx_vepc_free_page(entry)) + continue; + + xa_erase(&vepc->page_array, index); + } + + /* + * Retry EREMOVE'ing pages. This will clean up any SECS pages that + * only had children in this 'epc' area. + */ + xa_for_each(&vepc->page_array, index, entry) { + epc_page = entry; + /* + * An EREMOVE failure here means that the SECS page still + * has children. But, since all children in this 'sgx_vepc' + * have been removed, the SECS page must have a child on + * another instance. + */ + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + + xa_erase(&vepc->page_array, index); + } + + /* + * SECS pages are "pinned" by child pages, and "unpinned" once all + * children have been EREMOVE'd. A child page in this instance + * may have pinned an SECS page encountered in an earlier release(), + * creating a zombie. Since some children were EREMOVE'd above, + * try to EREMOVE all zombies in the hopes that one was unpinned. + */ + mutex_lock(&zombie_secs_pages_lock); + list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) { + /* + * Speculatively remove the page from the list of zombies, + * if the page is successfully EREMOVE'd it will be added to + * the list of free pages. If EREMOVE fails, throw the page + * on the local list, which will be spliced on at the end. + */ + list_del(&epc_page->list); + + if (sgx_vepc_free_page(epc_page)) + list_add_tail(&epc_page->list, &secs_pages); + } + + if (!list_empty(&secs_pages)) + list_splice_tail(&secs_pages, &zombie_secs_pages); + mutex_unlock(&zombie_secs_pages_lock); + + kfree(vepc); + + return 0; +} + +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + struct sgx_vepc *vepc; + + vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL); + if (!vepc) + return -ENOMEM; + mutex_init(&vepc->lock); + xa_init(&vepc->page_array); + + file->private_data = vepc; + + return 0; +} + +static const struct file_operations sgx_vepc_fops = { + .owner = THIS_MODULE, + .open = sgx_vepc_open, + .release = sgx_vepc_release, + .mmap = sgx_vepc_mmap, +}; + +static struct miscdevice sgx_vepc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "sgx_vepc", + .nodename = "sgx_vepc", + .fops = &sgx_vepc_fops, +}; + +int __init sgx_vepc_init(void) +{ + /* SGX virtualization requires KVM to work */ + if (!cpu_feature_enabled(X86_FEATURE_VMX)) + return -ENODEV; + + INIT_LIST_HEAD(&zombie_secs_pages); + mutex_init(&zombie_secs_pages_lock); + + return misc_register(&sgx_vepc_dev); +} + +/** + * sgx_virt_ecreate() - Run ECREATE on behalf of guest + * @pageinfo: Pointer to PAGEINFO structure + * @secs: Userspace pointer to SECS page + * @trapnr: trap number injected to guest in case of ECREATE error + * + * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose + * of enforcing policies of guest's enclaves, and return the trap number + * which should be injected to guest in case of any ECREATE error. + * + * Return: + * - 0: ECREATE was successful. + * - <0: on error. + */ +int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, + int *trapnr) +{ + int ret; + + /* + * @secs is an untrusted, userspace-provided address. It comes from + * KVM and is assumed to be a valid pointer which points somewhere in + * userspace. This can fault and call SGX or other fault handlers when + * userspace mapping @secs doesn't exist. + * + * Add a WARN() to make sure @secs is already valid userspace pointer + * from caller (KVM), who should already have handled invalid pointer + * case (for instance, made by malicious guest). All other checks, + * such as alignment of @secs, are deferred to ENCLS itself. + */ + if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + /* ECREATE doesn't return an error code, it faults or succeeds. */ + WARN_ON_ONCE(ret); + return 0; +} +EXPORT_SYMBOL_GPL(sgx_virt_ecreate); + +static int __sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + /* + * Make sure all userspace pointers from caller (KVM) are valid. + * All other checks deferred to ENCLS itself. Also see comment + * for @secs in sgx_virt_ecreate(). + */ +#define SGX_EINITTOKEN_SIZE 304 + if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) || + !access_ok(token, SGX_EINITTOKEN_SIZE) || + !access_ok(secs, PAGE_SIZE))) + return -EINVAL; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + + return ret; +} + +/** + * sgx_virt_einit() - Run EINIT on behalf of guest + * @sigstruct: Userspace pointer to SIGSTRUCT structure + * @token: Userspace pointer to EINITTOKEN structure + * @secs: Userspace pointer to SECS page + * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values + * @trapnr: trap number injected to guest in case of EINIT error + * + * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available + * in host, SGX driver may rewrite the hardware values at wish, therefore KVM + * needs to update hardware values to guest's virtual MSR values in order to + * ensure EINIT is executed with expected hardware values. + * + * Return: + * - 0: EINIT was successful. + * - <0: on error. + */ +int sgx_virt_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + ret = __sgx_virt_einit(sigstruct, token, secs); + } else { + preempt_disable(); + + sgx_update_lepubkeyhash(lepubkeyhash); + + ret = __sgx_virt_einit(sigstruct, token, secs); + preempt_enable(); + } + + /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */ + if (ret == -EINVAL) + return ret; + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sgx_virt_einit); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 8678864ce712..132a2de44d2f 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,7 +30,7 @@ EXPORT_SYMBOL(__max_die_per_package); #ifdef CONFIG_SMP /* - * Check if given CPUID extended toplogy "leaf" is implemented + * Check if given CPUID extended topology "leaf" is implemented */ static int check_extended_topology_leaf(int leaf) { @@ -44,7 +44,7 @@ static int check_extended_topology_leaf(int leaf) return 0; } /* - * Return best CPUID Extended Toplogy Leaf supported + * Return best CPUID Extended Topology Leaf supported */ static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index c6ede3b3d302..c04b933f48d3 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -27,6 +27,7 @@ #include <linux/clocksource.h> #include <linux/cpu.h> #include <linux/reboot.h> +#include <linux/static_call.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> @@ -336,11 +337,11 @@ static void __init vmware_paravirt_ops_setup(void) vmware_cyc2ns_setup(); if (vmw_sched_clock) - pv_ops.time.sched_clock = vmware_sched_clock; + paravirt_set_sched_clock(vmware_sched_clock); if (vmware_is_stealclock_available()) { has_steal_clock = true; - pv_ops.time.steal_clock = vmware_steal_clock; + static_call_update(pv_steal_clock, vmware_steal_clock); /* We use reboot notifier only to disable steal clock */ register_reboot_notifier(&vmware_pv_reboot_nb); @@ -378,6 +379,8 @@ static void __init vmware_set_capabilities(void) { setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_tsc_khz) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL) setup_force_cpu_cap(X86_FEATURE_VMCALL); else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a8f3af257e26..54ce999ed321 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -323,8 +323,8 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, cmem->nr_ranges = 1; /* Exclude elf header region */ - start = image->arch.elf_load_addr; - end = start + image->arch.elf_headers_sz - 1; + start = image->elf_load_addr; + end = start + image->elf_headers_sz - 1; return crash_exclude_mem_range(cmem, start, end); } @@ -337,7 +337,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(sizeof(struct crash_mem)); + cmem = vzalloc(struct_size(cmem, ranges, 1)); if (!cmem) return -ENOMEM; @@ -407,20 +407,20 @@ int crash_load_segments(struct kimage *image) if (ret) return ret; - image->arch.elf_headers = kbuf.buffer; - image->arch.elf_headers_sz = kbuf.bufsz; + image->elf_headers = kbuf.buffer; + image->elf_headers_sz = kbuf.bufsz; kbuf.memsz = kbuf.bufsz; kbuf.buf_align = ELF_CORE_HEADER_ALIGN; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); if (ret) { - vfree((void *)image->arch.elf_headers); + vfree((void *)image->elf_headers); return ret; } - image->arch.elf_load_addr = kbuf.mem; + image->elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); + image->elf_load_addr, kbuf.bufsz, kbuf.bufsz); return ret; } diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 759d392cbe9f..d1d49e3d536b 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -100,9 +100,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .ss = __KERNEL_DS, .ds = __USER_DS, .fs = __KERNEL_PERCPU, -#ifndef CONFIG_X86_32_LAZY_GS - .gs = __KERNEL_STACK_CANARY, -#endif + .gs = 0, .__cr3 = __pa_nodebug(swapper_pg_dir), }, diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 22aad412f965..bc0657f0deed 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -31,8 +31,8 @@ * - inform the user about the firmware's notion of memory layout * via /sys/firmware/memmap * - * - the hibernation code uses it to generate a kernel-independent MD5 - * fingerprint of the physical memory layout of a system. + * - the hibernation code uses it to generate a kernel-independent CRC32 + * checksum of the physical memory layout of a system. * * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version * passed to us by the bootloader - the major difference between @@ -793,7 +793,7 @@ core_initcall(e820__register_nvs_regions); #endif /* - * Allocate the requested number of bytes with the requsted alignment + * Allocate the requested number of bytes with the requested alignment * and return (the physical address) to the caller. Also register this * range in the 'kexec' E820 table as a reserved range. * diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a4b5af03dcc1..6edd1e2ee8af 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -551,6 +551,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_EHL_IDS(&gen11_early_ops), INTEL_TGL_12_IDS(&gen11_early_ops), INTEL_RKL_IDS(&gen11_early_ops), + INTEL_ADLS_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 683749b80ae2..a85c64000218 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -253,7 +253,7 @@ static bool xfeature_enabled(enum xfeature xfeature) static void __init setup_xstate_features(void) { u32 eax, ebx, ecx, edx, i; - /* start at the beginnning of the "extended state" */ + /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7edbd5ee5ed4..1b3ce3b4a2a2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -66,7 +66,7 @@ int ftrace_arch_code_modify_post_process(void) static const char *ftrace_nop_replace(void) { - return ideal_nops[NOP_ATOMIC5]; + return x86_nops[5]; } static const char *ftrace_call_replace(unsigned long ip, unsigned long addr) @@ -377,7 +377,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2); + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); if (ret < 0) goto fail; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5e9beb77cafd..18be44163a50 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -104,7 +104,7 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) static bool __head check_la57_support(unsigned long physaddr) { /* - * 5-level paging is detected and enabled at kernel decomression + * 5-level paging is detected and enabled at kernel decompression * stage. Only check if it has been enabled there. */ if (!(native_read_cr4() & X86_CR4_LA57)) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7ed84c282233..67f590425d90 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -318,8 +318,8 @@ SYM_FUNC_START(startup_32_smp) movl $(__KERNEL_PERCPU), %eax movl %eax,%fs # set this cpu's percpu - movl $(__KERNEL_STACK_CANARY),%eax - movl %eax,%gs + xorl %eax,%eax + movl %eax,%gs # clear possible garbage in %gs xorl %eax,%eax # Clear LDT lldt %ax @@ -339,20 +339,6 @@ SYM_FUNC_END(startup_32_smp) */ __INIT setup_once: -#ifdef CONFIG_STACKPROTECTOR - /* - * Configure the stack canary. The linker can't handle this by - * relocation. Manually set base address in stack canary - * segment descriptor. - */ - movl $gdt_page,%eax - movl $stack_canary,%ecx - movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) - shrl $16, %ecx - movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) - movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) -#endif - andl $0,setup_once_ref /* Once is enough, thanks */ ret diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ee1a283f8e96..d552f177eca0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -245,7 +245,7 @@ static const __initconst struct idt_data ist_idts[] = { * after that. * * Note, that X86_64 cannot install the real #PF handler in - * idt_setup_early_traps() because the memory intialization needs the #PF + * idt_setup_early_traps() because the memory initialization needs the #PF * handler from the early_idt_handler_array to initialize the early page * tables. */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 58aa712973ac..e28f6a5d14f1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -338,7 +338,7 @@ void fixup_irqs(void) irq_migrate_all_off_this_cpu(); /* - * We can remove mdelay() and then send spuriuous interrupts to + * We can remove mdelay() and then send spurious interrupts to * new cpu targets for all the irqs that were handled previously by * this cpu. While it works, I have seen spurious interrupt messages * (nothing wrong but still...). diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 5ba8477c2cb7..6a2eb62c85e6 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -28,10 +28,8 @@ static void bug_at(const void *ip, int line) } static const void * -__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init) +__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type) { - const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; - const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; const void *expect, *code; const void *addr, *dest; int line; @@ -41,10 +39,8 @@ __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); - if (init) { - expect = default_nop; line = __LINE__; - } else if (type == JUMP_LABEL_JMP) { - expect = ideal_nop; line = __LINE__; + if (type == JUMP_LABEL_JMP) { + expect = x86_nops[5]; line = __LINE__; } else { expect = code; line = __LINE__; } @@ -53,7 +49,7 @@ __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, bug_at(addr, line); if (type == JUMP_LABEL_NOP) - code = ideal_nop; + code = x86_nops[5]; return code; } @@ -62,7 +58,7 @@ static inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { - const void *opcode = __jump_label_set_jump_code(entry, type, init); + const void *opcode = __jump_label_set_jump_code(entry, type); /* * As long as only a single processor is running and the code is still @@ -113,7 +109,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } mutex_lock(&text_mutex); - opcode = __jump_label_set_jump_code(entry, type, 0); + opcode = __jump_label_set_jump_code(entry, type); text_poke_queue((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); mutex_unlock(&text_mutex); @@ -136,22 +132,6 @@ static enum { __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - /* - * This function is called at boot up and when modules are - * first loaded. Check if the default nop, the one that is - * inserted at compile time, is the ideal nop. If it is, then - * we do not need to update the nop, and we can leave it as is. - * If it is not, then we need to update the nop to the ideal nop. - */ - if (jlstate == JL_STATE_START) { - const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; - const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - - if (memcmp(ideal_nop, default_nop, 5) != 0) - jlstate = JL_STATE_UPDATE; - else - jlstate = JL_STATE_NO_UPDATE; - } if (jlstate == JL_STATE_UPDATE) jump_label_transform(entry, type, 1); } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index ce831f9448e7..170d0fd68b1f 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -75,7 +75,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, if (image->type == KEXEC_TYPE_CRASH) { len = sprintf(cmdline_ptr, - "elfcorehdr=0x%lx ", image->arch.elf_load_addr); + "elfcorehdr=0x%lx ", image->elf_load_addr); } memcpy(cmdline_ptr + len, cmdline, cmdline_len); cmdline_len += len; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index ff7878df96b4..3a43a2dee658 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -17,7 +17,7 @@ * Updated by: Tom Rini <trini@kernel.crashing.org> * Updated by: Jason Wessel <jason.wessel@windriver.com> * Modified for 386 by Jim Kingdon, Cygnus Support. - * Origianl kgdb, compatibility with 2.1.xx kernel by + * Original kgdb, compatibility with 2.1.xx kernel by * David Grothe <dave@gcom.com> * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com> * X86_64 changes from Andi Kleen's patch merged by Jim Houston @@ -642,7 +642,7 @@ void kgdb_arch_late(void) struct perf_event **pevent; /* - * Pre-allocate the hw breakpoint structions in the non-atomic + * Pre-allocate the hw breakpoint instructions in the non-atomic * portion of kgdb because this operation requires mutexs to * complete. */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index df776cdca327..d3d65545cb8b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -139,6 +139,8 @@ NOKPROBE_SYMBOL(synthesize_relcall); int can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; + insn_byte_t prefix; + int i; if (search_exception_tables((unsigned long)addr)) return 0; /* Page fault may occur on this address. */ @@ -151,35 +153,39 @@ int can_boost(struct insn *insn, void *addr) if (insn->opcode.nbytes != 1) return 0; - /* Can't boost Address-size override prefix */ - if (unlikely(inat_is_address_size_prefix(insn->attr))) - return 0; + for_each_insn_prefix(insn, i, prefix) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute(prefix); + /* Can't boost Address-size override prefix and CS override prefix */ + if (prefix == 0x2e || inat_is_address_size_prefix(attr)) + return 0; + } opcode = insn->opcode.bytes[0]; - switch (opcode & 0xf0) { - case 0x60: - /* can't boost "bound" */ - return (opcode != 0x62); - case 0x70: - return 0; /* can't boost conditional jump */ - case 0x90: - return opcode != 0x9a; /* can't boost call far */ - case 0xc0: - /* can't boost software-interruptions */ - return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; - case 0xd0: - /* can boost AA* and XLAT */ - return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); - case 0xe0: - /* can boost in/out and absolute jmps */ - return ((opcode & 0x04) || opcode == 0xea); - case 0xf0: - /* clear and set flags are boostable */ - return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); + switch (opcode) { + case 0x62: /* bound */ + case 0x70 ... 0x7f: /* Conditional jumps */ + case 0x9a: /* Call far */ + case 0xc0 ... 0xc1: /* Grp2 */ + case 0xcc ... 0xce: /* software exceptions */ + case 0xd0 ... 0xd3: /* Grp2 */ + case 0xd6: /* (UD) */ + case 0xd8 ... 0xdf: /* ESC */ + case 0xe0 ... 0xe3: /* LOOP*, JCXZ */ + case 0xe8 ... 0xe9: /* near Call, JMP */ + case 0xeb: /* Short JMP */ + case 0xf0 ... 0xf4: /* LOCK/REP, HLT */ + case 0xf6 ... 0xf7: /* Grp3 */ + case 0xfe: /* Grp4 */ + /* ... are not boostable */ + return 0; + case 0xff: /* Grp5 */ + /* Only indirect jmp is boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 4; default: - /* CS override prefix and call are not boostable */ - return (opcode != 0x2e && opcode != 0x9a); + return 1; } } @@ -229,7 +235,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) return 0UL; if (faddr) - memcpy(buf, ideal_nops[NOP_ATOMIC5], 5); + memcpy(buf, x86_nops[5], 5); else buf[0] = kp->opcode; return (unsigned long)buf; @@ -265,6 +271,8 @@ static int can_probe(unsigned long paddr) /* Decode instructions */ addr = paddr - offset; while (addr < paddr) { + int ret; + /* * Check if the instruction has been modified by another * kprobe, in which case we replace the breakpoint by the @@ -276,8 +284,10 @@ static int can_probe(unsigned long paddr) __addr = recover_probed_instruction(buf, addr); if (!__addr) return 0; - kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); - insn_get_length(&insn); + + ret = insn_decode_kernel(&insn, (void *)__addr); + if (ret < 0) + return 0; /* * Another debugging subsystem might insert this breakpoint. @@ -301,8 +311,8 @@ static int can_probe(unsigned long paddr) int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) { kprobe_opcode_t buf[MAX_INSN_SIZE]; - unsigned long recovered_insn = - recover_probed_instruction(buf, (unsigned long)src); + unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src); + int ret; if (!recovered_insn || !insn) return 0; @@ -312,8 +322,9 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) MAX_INSN_SIZE)) return 0; - kernel_insn_init(insn, dest, MAX_INSN_SIZE); - insn_get_length(insn); + ret = insn_decode_kernel(insn, dest); + if (ret < 0) + return 0; /* We can not probe force emulate prefixed instruction */ if (insn_has_emulate_prefix(insn)) @@ -357,13 +368,14 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) return insn->length; } -/* Prepare reljump right after instruction to boost */ -static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, - struct insn *insn) +/* Prepare reljump or int3 right after instruction */ +static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p, + struct insn *insn) { int len = insn->length; - if (can_boost(insn, p->addr) && + if (!IS_ENABLED(CONFIG_PREEMPTION) && + !p->post_handler && can_boost(insn, p->addr) && MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) { /* * These instructions can be executed directly if it @@ -374,7 +386,12 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p, len += JMP32_INSN_SIZE; p->ainsn.boostable = 1; } else { - p->ainsn.boostable = 0; + /* Otherwise, put an int3 for trapping singlestep */ + if (MAX_INSN_SIZE - len < INT3_INSN_SIZE) + return -ENOSPC; + + buf[len] = INT3_INSN_OPCODE; + len += INT3_INSN_SIZE; } return len; @@ -411,86 +428,290 @@ void free_insn_page(void *page) module_memfree(page); } -static void set_resume_flags(struct kprobe *p, struct insn *insn) +/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ + +static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) +{ + switch (p->ainsn.opcode) { + case 0xfa: /* cli */ + regs->flags &= ~(X86_EFLAGS_IF); + break; + case 0xfb: /* sti */ + regs->flags |= X86_EFLAGS_IF; + break; + case 0x9c: /* pushf */ + int3_emulate_push(regs, regs->flags); + break; + case 0x9d: /* popf */ + regs->flags = int3_emulate_pop(regs); + break; + } + regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; +} +NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers); + +static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs) +{ + int3_emulate_ret(regs); +} +NOKPROBE_SYMBOL(kprobe_emulate_ret); + +static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + func += p->ainsn.rel32; + int3_emulate_call(regs, func); +} +NOKPROBE_SYMBOL(kprobe_emulate_call); + +static nokprobe_inline +void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond) +{ + unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size; + + if (cond) + ip += p->ainsn.rel32; + int3_emulate_jmp(regs, ip); +} + +static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs) +{ + __kprobe_emulate_jmp(p, regs, true); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp); + +static const unsigned long jcc_mask[6] = { + [0] = X86_EFLAGS_OF, + [1] = X86_EFLAGS_CF, + [2] = X86_EFLAGS_ZF, + [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, + [4] = X86_EFLAGS_SF, + [5] = X86_EFLAGS_PF, +}; + +static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs) +{ + bool invert = p->ainsn.jcc.type & 1; + bool match; + + if (p->ainsn.jcc.type < 0xc) { + match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1]; + } else { + match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ + ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); + if (p->ainsn.jcc.type >= 0xe) + match = match && (regs->flags & X86_EFLAGS_ZF); + } + __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jcc); + +static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs) +{ + bool match; + + if (p->ainsn.loop.type != 3) { /* LOOP* */ + if (p->ainsn.loop.asize == 32) + match = ((*(u32 *)®s->cx)--) != 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = ((*(u64 *)®s->cx)--) != 0; +#endif + else + match = ((*(u16 *)®s->cx)--) != 0; + } else { /* JCXZ */ + if (p->ainsn.loop.asize == 32) + match = *(u32 *)(®s->cx) == 0; +#ifdef CONFIG_X86_64 + else if (p->ainsn.loop.asize == 64) + match = *(u64 *)(®s->cx) == 0; +#endif + else + match = *(u16 *)(®s->cx) == 0; + } + + if (p->ainsn.loop.type == 0) /* LOOPNE */ + match = match && !(regs->flags & X86_EFLAGS_ZF); + else if (p->ainsn.loop.type == 1) /* LOOPE */ + match = match && (regs->flags & X86_EFLAGS_ZF); + + __kprobe_emulate_jmp(p, regs, match); +} +NOKPROBE_SYMBOL(kprobe_emulate_loop); + +static const int addrmode_regoffs[] = { + offsetof(struct pt_regs, ax), + offsetof(struct pt_regs, cx), + offsetof(struct pt_regs, dx), + offsetof(struct pt_regs, bx), + offsetof(struct pt_regs, sp), + offsetof(struct pt_regs, bp), + offsetof(struct pt_regs, si), + offsetof(struct pt_regs, di), +#ifdef CONFIG_X86_64 + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), +#endif +}; + +static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_call(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_call_indirect); + +static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg]; + + int3_emulate_jmp(regs, regs_get_register(regs, offs)); +} +NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect); + +static int prepare_emulation(struct kprobe *p, struct insn *insn) { insn_byte_t opcode = insn->opcode.bytes[0]; switch (opcode) { case 0xfa: /* cli */ case 0xfb: /* sti */ + case 0x9c: /* pushfl */ case 0x9d: /* popf/popfd */ - /* Check whether the instruction modifies Interrupt Flag or not */ - p->ainsn.if_modifier = 1; - break; - case 0x9c: /* pushfl */ - p->ainsn.is_pushf = 1; + /* + * IF modifiers must be emulated since it will enable interrupt while + * int3 single stepping. + */ + p->ainsn.emulate_op = kprobe_emulate_ifmodifiers; + p->ainsn.opcode = opcode; break; - case 0xcf: /* iret */ - p->ainsn.if_modifier = 1; - fallthrough; case 0xc2: /* ret/lret */ case 0xc3: case 0xca: case 0xcb: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - p->ainsn.is_abs_ip = 1; - /* Without resume jump, this is boostable */ - p->ainsn.boostable = 1; + p->ainsn.emulate_op = kprobe_emulate_ret; break; - case 0xe8: /* call relative - Fix return addr */ - p->ainsn.is_call = 1; + case 0x9a: /* far call absolute -- segment is not supported */ + case 0xea: /* far jmp absolute -- segment is not supported */ + case 0xcc: /* int3 */ + case 0xcf: /* iret -- in-kernel IRET is not supported */ + return -EOPNOTSUPP; break; -#ifdef CONFIG_X86_32 - case 0x9a: /* call absolute -- same as call absolute, indirect */ - p->ainsn.is_call = 1; - p->ainsn.is_abs_ip = 1; + case 0xe8: /* near call relative */ + p->ainsn.emulate_op = kprobe_emulate_call; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; break; -#endif - case 0xff: + case 0xeb: /* short jump relative */ + case 0xe9: /* near jump relative */ + p->ainsn.emulate_op = kprobe_emulate_jmp; + if (insn->immediate.nbytes == 1) + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + else if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + break; + case 0x70 ... 0x7f: + /* 1 byte conditional jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + p->ainsn.rel32 = *(char *)insn->immediate.bytes; + break; + case 0x0f: opcode = insn->opcode.bytes[1]; + if ((opcode & 0xf0) == 0x80) { + /* 2 bytes Conditional Jump */ + p->ainsn.emulate_op = kprobe_emulate_jcc; + p->ainsn.jcc.type = opcode & 0xf; + if (insn->immediate.nbytes == 2) + p->ainsn.rel32 = *(s16 *)&insn->immediate.value; + else + p->ainsn.rel32 = *(s32 *)&insn->immediate.value; + } else if (opcode == 0x01 && + X86_MODRM_REG(insn->modrm.bytes[0]) == 0 && + X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) { + /* VM extensions - not supported */ + return -EOPNOTSUPP; + } + break; + case 0xe0: /* Loop NZ */ + case 0xe1: /* Loop */ + case 0xe2: /* Loop */ + case 0xe3: /* J*CXZ */ + p->ainsn.emulate_op = kprobe_emulate_loop; + p->ainsn.loop.type = opcode & 0x3; + p->ainsn.loop.asize = insn->addr_bytes * 8; + p->ainsn.rel32 = *(s8 *)&insn->immediate.value; + break; + case 0xff: + /* + * Since the 0xff is an extended group opcode, the instruction + * is determined by the MOD/RM byte. + */ + opcode = insn->modrm.bytes[0]; if ((opcode & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; ip is correct. - * But this is not boostable - */ - p->ainsn.is_call = 1; - p->ainsn.is_abs_ip = 1; + if ((opcode & 0x8) == 0x8) + return -EOPNOTSUPP; /* far call */ + /* call absolute, indirect */ + p->ainsn.emulate_op = kprobe_emulate_call_indirect; + } else if ((opcode & 0x30) == 0x20) { + if ((opcode & 0x8) == 0x8) + return -EOPNOTSUPP; /* far jmp */ + /* jmp near absolute indirect */ + p->ainsn.emulate_op = kprobe_emulate_jmp_indirect; + } else break; - } else if (((opcode & 0x31) == 0x20) || - ((opcode & 0x31) == 0x21)) { - /* - * jmp near and far, absolute indirect - * ip is correct. - */ - p->ainsn.is_abs_ip = 1; - /* Without resume jump, this is boostable */ - p->ainsn.boostable = 1; - } + + if (insn->addr_bytes != sizeof(unsigned long)) + return -EOPNOTSUPP; /* Don't support differnt size */ + if (X86_MODRM_MOD(opcode) != 3) + return -EOPNOTSUPP; /* TODO: support memory addressing */ + + p->ainsn.indirect.reg = X86_MODRM_RM(opcode); +#ifdef CONFIG_X86_64 + if (X86_REX_B(insn->rex_prefix.value)) + p->ainsn.indirect.reg += 8; +#endif + break; + default: break; } + p->ainsn.size = insn->length; + + return 0; } static int arch_copy_kprobe(struct kprobe *p) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - int len; + int ret, len; /* Copy an instruction with recovering if other optprobe modifies it.*/ len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn); if (!len) return -EINVAL; - /* - * __copy_instruction can modify the displacement of the instruction, - * but it doesn't affect boostable check. - */ - len = prepare_boost(buf, p, &insn); + /* Analyze the opcode and setup emulate functions */ + ret = prepare_emulation(p, &insn); + if (ret < 0) + return ret; - /* Analyze the opcode and set resume flags */ - set_resume_flags(p, &insn); + /* Add int3 for single-step or booster jmp */ + len = prepare_singlestep(buf, p, &insn); + if (len < 0) + return len; /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; @@ -583,29 +804,7 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs, { __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_flags = kcb->kprobe_old_flags - = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - if (p->ainsn.if_modifier) - kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; -} - -static nokprobe_inline void clear_btf(void) -{ - if (test_thread_flag(TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - } -} - -static nokprobe_inline void restore_btf(void) -{ - if (test_thread_flag(TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl |= DEBUGCTLMSR_BTF; - update_debugctlmsr(debugctl); - } + = (regs->flags & X86_EFLAGS_IF); } void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) @@ -620,6 +819,22 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) } NOKPROBE_SYMBOL(arch_prepare_kretprobe); +static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); +} +NOKPROBE_SYMBOL(kprobe_post_process); + static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) { @@ -627,7 +842,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, return; #if !defined(CONFIG_PREEMPTION) - if (p->ainsn.boostable && !p->post_handler) { + if (p->ainsn.boostable) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) reset_current_kprobe(); @@ -646,19 +861,51 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; } else kcb->kprobe_status = KPROBE_HIT_SS; - /* Prepare real single stepping */ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; + + if (p->ainsn.emulate_op) { + p->ainsn.emulate_op(p, regs); + kprobe_post_process(p, regs, kcb); + return; + } + + /* Disable interrupt, and set ip register on trampoline */ regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == INT3_INSN_OPCODE) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; + regs->ip = (unsigned long)p->ainsn.insn; } NOKPROBE_SYMBOL(setup_singlestep); /* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. We also doesn't use trap, but "int3" again + * right after the copied instruction. + * Different from the trap single-step, "int3" single-step can not + * handle the instruction which changes the ip register, e.g. jmp, + * call, conditional jmp, and the instructions which changes the IF + * flags because interrupt must be disabled around the single-stepping. + * Such instructions are software emulated, but others are single-stepped + * using "int3". + * + * When the 2nd "int3" handled, the regs->ip and regs->flags needs to + * be adjusted, so that we can resume execution on correct code. + */ +static void resume_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + + /* Restore saved interrupt flag and ip register */ + regs->flags |= kcb->kprobe_saved_flags; + /* Note that regs->ip is executed int3 so must be a step back */ + regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE; +} +NOKPROBE_SYMBOL(resume_singlestep); + +/* * We have reentered the kprobe_handler(), since another probe was hit while * within the handler. We save the original kprobes variables and just single * step on the instruction of the new probe without calling any user handlers. @@ -693,6 +940,12 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(reenter_kprobe); +static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb) +{ + return (kcb->kprobe_status == KPROBE_HIT_SS || + kcb->kprobe_status == KPROBE_REENTER); +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -737,7 +990,18 @@ int kprobe_int3_handler(struct pt_regs *regs) reset_current_kprobe(); return 1; } - } else if (*addr != INT3_INSN_OPCODE) { + } else if (kprobe_is_ss(kcb)) { + p = kprobe_running(); + if ((unsigned long)p->ainsn.insn < regs->ip && + (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) { + /* Most provably this is the second int3 for singlestep */ + resume_singlestep(p, regs, kcb); + kprobe_post_process(p, regs, kcb); + return 1; + } + } + + if (*addr != INT3_INSN_OPCODE) { /* * The breakpoint instruction was removed right * after we hit it. Another cpu has removed @@ -810,91 +1074,6 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) } NOKPROBE_SYMBOL(trampoline_handler); -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new ip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed flags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - */ -static void resume_execution(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = stack_addr(regs); - unsigned long copy_ip = (unsigned long)p->ainsn.insn; - unsigned long orig_ip = (unsigned long)p->addr; - - regs->flags &= ~X86_EFLAGS_TF; - - /* Fixup the contents of top of stack */ - if (p->ainsn.is_pushf) { - *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); - *tos |= kcb->kprobe_old_flags; - } else if (p->ainsn.is_call) { - *tos = orig_ip + (*tos - copy_ip); - } - - if (!p->ainsn.is_abs_ip) - regs->ip += orig_ip - copy_ip; - - restore_btf(); -} -NOKPROBE_SYMBOL(resume_execution); - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled throughout this function. - */ -int kprobe_debug_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - resume_execution(cur, regs, kcb); - regs->flags |= kcb->kprobe_saved_flags; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - /* Restore back the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - /* - * if somebody else is singlestepping across a probe point, flags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->flags & X86_EFLAGS_TF) - return 0; - - return 1; -} -NOKPROBE_SYMBOL(kprobe_debug_handler); - int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); @@ -912,20 +1091,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * normal page fault. */ regs->ip = (unsigned long)cur->addr; - /* - * Trap flag (TF) has been set here because this fault - * happened where the single stepping will be done. - * So clear it by resetting the current kprobe: - */ - regs->flags &= ~X86_EFLAGS_TF; - /* - * Since the single step (trap) has been cancelled, - * we need to restore BTF here. - */ - restore_btf(); /* - * If the TF flag was set before the kprobe hit, + * If the IF flag was set before the kprobe hit, * don't touch it: */ regs->flags |= kcb->kprobe_old_flags; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 51c7f5271aee..596de2f6d3a5 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,7 +12,7 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabled */ +/* Ftrace callback handler for kprobes -- called under preempt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 08eb23074f92..71425ebba98a 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -312,6 +312,8 @@ static int can_optimize(unsigned long paddr) addr = paddr - offset; while (addr < paddr - offset + size) { /* Decode until function end */ unsigned long recovered_insn; + int ret; + if (search_exception_tables(addr)) /* * Since some fixup code will jumps into this function, @@ -321,8 +323,11 @@ static int can_optimize(unsigned long paddr) recovered_insn = recover_probed_instruction(buf, addr); if (!recovered_insn) return 0; - kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); - insn_get_length(&insn); + + ret = insn_decode_kernel(&insn, (void *)recovered_insn); + if (ret < 0) + return 0; + /* * In the case of detecting unknown breakpoint, this could be * a padding INT3 between functions. Let's check that all the diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 78bb0fae3982..172c947240b9 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -650,7 +650,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; - pv_ops.time.steal_clock = kvm_steal_clock; + static_call_update(pv_steal_clock, kvm_steal_clock); } if (pv_tlb_flush_supported()) { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1fc0962c89c0..d37ed4e1d033 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -106,7 +106,7 @@ static inline void kvm_sched_clock_init(bool stable) if (!stable) clear_sched_clock_stable(); kvm_sched_clock_offset = kvm_clock_read(); - pv_ops.time.sched_clock = kvm_sched_clock_read; + paravirt_set_sched_clock(kvm_sched_clock_read); pr_info("kvm-clock: using sched offset of %llu cycles", kvm_sched_clock_offset); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a29a44a98e5b..c078b0d3ab0e 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -260,7 +260,7 @@ static void set_idt(void *newidt, u16 limit) { struct desc_ptr curidt; - /* x86-64 supports unaliged loads & stores */ + /* x86-64 supports unaligned loads & stores */ curidt.size = limit; curidt.address = (unsigned long)newidt; @@ -402,8 +402,8 @@ void machine_kexec(struct kimage *image) #ifdef CONFIG_KEXEC_FILE void *arch_kexec_kernel_image_load(struct kimage *image) { - vfree(image->arch.elf_headers); - image->arch.elf_headers = NULL; + vfree(image->elf_headers); + image->elf_headers = NULL; if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 4f75d0cf6305..9e1ea99ad9df 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -32,3 +32,12 @@ bool pv_is_native_vcpu_is_preempted(void) return pv_ops.lock.vcpu_is_preempted.func == __raw_callee_save___native_vcpu_is_preempted; } + +void __init paravirt_set_cap(void) +{ + if (!pv_is_native_spin_unlock()) + setup_force_cpu_cap(X86_FEATURE_PVUNLOCK); + + if (!pv_is_native_vcpu_is_preempted()) + setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT); +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c60222ab8ab9..d0730264786b 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -14,6 +14,7 @@ #include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/pgtable.h> +#include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> @@ -52,7 +53,10 @@ void __init default_banner(void) } /* Undefined instruction for dealing with missing ops pointers. */ -static const unsigned char ud2a[] = { 0x0f, 0x0b }; +static void paravirt_BUG(void) +{ + BUG(); +} struct branch { unsigned char opcode; @@ -85,25 +89,6 @@ u64 notrace _paravirt_ident_64(u64 x) { return x; } - -static unsigned paravirt_patch_jmp(void *insn_buff, const void *target, - unsigned long addr, unsigned len) -{ - struct branch *b = insn_buff; - unsigned long delta = (unsigned long)target - (addr+5); - - if (len < 5) { -#ifdef CONFIG_RETPOLINE - WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); -#endif - return len; /* call too long for patch site */ - } - - b->opcode = 0xe9; /* jmp */ - b->delta = delta; - - return 5; -} #endif DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); @@ -114,8 +99,8 @@ void __init native_pv_lock_init(void) static_branch_disable(&virt_spin_lock_key); } -unsigned paravirt_patch_default(u8 type, void *insn_buff, - unsigned long addr, unsigned len) +unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) { /* * Neat trick to map patch type back to the call within the @@ -125,20 +110,10 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned ret; if (opfunc == NULL) - /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a)); + /* If there's no function, patch it with paravirt_BUG() */ + ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); else if (opfunc == _paravirt_nop) ret = 0; - -#ifdef CONFIG_PARAVIRT_XXL - /* identity functions just return their single argument */ - else if (opfunc == _paravirt_ident_64) - ret = paravirt_patch_ident_64(insn_buff, len); - - else if (type == PARAVIRT_PATCH(cpu.iret)) - /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); -#endif else /* Otherwise call the function. */ ret = paravirt_patch_call(insn_buff, opfunc, addr, len); @@ -146,19 +121,6 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff, return ret; } -unsigned paravirt_patch_insns(void *insn_buff, unsigned len, - const char *start, const char *end) -{ - unsigned insn_len = end - start; - - /* Alternative instruction is too large for the patch site and we cannot continue: */ - BUG_ON(insn_len > len || start == NULL); - - memcpy(insn_buff, start, insn_len); - - return insn_len; -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -167,6 +129,14 @@ static u64 native_steal_clock(int cpu) return 0; } +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); +DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); + +void paravirt_set_sched_clock(u64 (*func)(void)) +{ + static_call_update(pv_sched_clock, func); +} + /* These are in entry.S */ extern void native_iret(void); @@ -269,13 +239,6 @@ struct pv_info pv_info = { #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { - /* Init ops. */ - .init.patch = native_patch, - - /* Time ops. */ - .time.sched_clock = native_sched_clock, - .time.steal_clock = native_steal_clock, - /* Cpu ops. */ .cpu.io_delay = native_io_delay, @@ -308,8 +271,6 @@ struct paravirt_patch_template pv_ops = { .cpu.load_sp0 = native_load_sp0, - .cpu.iret = native_iret, - #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, .cpu.update_io_bitmap = native_tss_update_io_bitmap, @@ -414,6 +375,8 @@ struct paravirt_patch_template pv_ops = { NOKPROBE_SYMBOL(native_get_debugreg); NOKPROBE_SYMBOL(native_set_debugreg); NOKPROBE_SYMBOL(native_load_idt); + +void (*paravirt_iret)(void) = native_iret; #endif EXPORT_SYMBOL(pv_ops); diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c deleted file mode 100644 index abd27ec67397..000000000000 --- a/arch/x86/kernel/paravirt_patch.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/stringify.h> - -#include <asm/paravirt.h> -#include <asm/asm-offsets.h> - -#define PSTART(d, m) \ - patch_data_##d.m - -#define PEND(d, m) \ - (PSTART(d, m) + sizeof(patch_data_##d.m)) - -#define PATCH(d, m, insn_buff, len) \ - paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m)) - -#define PATCH_CASE(ops, m, data, insn_buff, len) \ - case PARAVIRT_PATCH(ops.m): \ - return PATCH(data, ops##_##m, insn_buff, len) - -#ifdef CONFIG_PARAVIRT_XXL -struct patch_xxl { - const unsigned char irq_irq_disable[1]; - const unsigned char irq_irq_enable[1]; - const unsigned char irq_save_fl[2]; - const unsigned char mmu_read_cr2[3]; - const unsigned char mmu_read_cr3[3]; - const unsigned char mmu_write_cr3[3]; - const unsigned char cpu_wbinvd[2]; - const unsigned char mov64[3]; -}; - -static const struct patch_xxl patch_data_xxl = { - .irq_irq_disable = { 0xfa }, // cli - .irq_irq_enable = { 0xfb }, // sti - .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax - .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax - .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax - .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 - .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd - .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax -}; - -unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) -{ - return PATCH(xxl, mov64, insn_buff, len); -} -# endif /* CONFIG_PARAVIRT_XXL */ - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -struct patch_lock { - unsigned char queued_spin_unlock[3]; - unsigned char vcpu_is_preempted[2]; -}; - -static const struct patch_lock patch_data_lock = { - .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax - -# ifdef CONFIG_X86_64 - .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi) -# else - .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax) -# endif -}; -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - -unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, - unsigned int len) -{ - switch (type) { - -#ifdef CONFIG_PARAVIRT_XXL - PATCH_CASE(irq, save_fl, xxl, insn_buff, len); - PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); - PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); - - PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len); - PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); - PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); - - PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); -#endif - -#ifdef CONFIG_PARAVIRT_SPINLOCKS - case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return PATCH(lock, queued_spin_unlock, insn_buff, len); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return PATCH(lock, vcpu_is_preempted, insn_buff, len); - break; -#endif - default: - break; - } - - return paravirt_patch_default(type, insn_buff, addr, len); -} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9c214d7085a4..43cbfc84153a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, - /* - * .sp1 is cpu_current_top_of_stack. The init task never - * runs user code, but cpu_current_top_of_stack should still - * be well defined before the first context switch. - */ +#ifdef CONFIG_X86_32 .sp1 = TOP_OF_INIT_STACK, -#ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, #endif @@ -451,7 +446,7 @@ void speculative_store_bypass_ht_init(void) * First HT sibling to come up on the core. Link shared state of * the first HT sibling to itself. The siblings on the same core * which come up later will see the shared state pointer and link - * themself to the state of this CPU. + * themselves to the state of this CPU. */ st->shared_state = st; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 11065dc03f5b..eda37df016f0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -89,7 +89,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) /* * Assumption here is that last_value, a global accumulator, always goes * forward. If we are less than that, we should not be much smaller. - * We assume there is an error marging we're inside, and then the correction + * We assume there is an error margin we're inside, and then the correction * does not sacrifice accuracy. * * For reads: global may have changed between test and return, diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 94b33885f8d2..f469153eca8a 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -107,7 +107,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. - * - Proctected mode enabled + * - Protected mode enabled */ movl %cr0, %eax andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index a4d9a261425b..c53271aebb64 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -121,7 +121,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * - Write protect disabled * - No task switch * - Don't do FP software emulation. - * - Proctected mode enabled + * - Protected mode enabled */ movq %cr0, %rax andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d883176ef2ce..72920af0b3c0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -65,7 +65,7 @@ RESERVE_BRK(dmi_alloc, 65536); /* * Range of the BSS area. The size of the BSS area is determined - * at link time, with RESERVE_BRK*() facility reserving additional + * at link time, with RESERVE_BRK() facility reserving additional * chunks. */ unsigned long _brk_start = (unsigned long)__brk_base; @@ -633,11 +633,16 @@ static void __init trim_snb_memory(void) printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); /* - * Reserve all memory below the 1 MB mark that has not - * already been reserved. + * SandyBridge integrated graphics devices have a bug that prevents + * them from accessing certain memory ranges, namely anything below + * 1M and in the pages listed in bad_pages[] above. + * + * To avoid these pages being ever accessed by SNB gfx devices + * reserve all memory below the 1 MB mark and bad_pages that have + * not already been reserved at boot time. */ memblock_reserve(0, 1<<20); - + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { if (memblock_reserve(bad_pages[i], PAGE_SIZE)) printk(KERN_WARNING "failed to reserve 0x%08lx\n", @@ -645,18 +650,6 @@ static void __init trim_snb_memory(void) } } -/* - * Here we put platform-specific memory range workarounds, i.e. - * memory known to be corrupt or otherwise in need to be reserved on - * specific platforms. - * - * If this gets used more widely it could use a real dispatch mechanism. - */ -static void __init trim_platform_memory_ranges(void) -{ - trim_snb_memory(); -} - static void __init trim_bios_range(void) { /* @@ -725,11 +718,41 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); -static void __init trim_low_memory_range(void) +static void __init early_reserve_memory(void) { + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ + memblock_reserve(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + + /* + * The first 4Kb of memory is a BIOS owned area, but generally it is + * not listed as such in the E820 table. + * + * Reserve the first memory page and typically some additional + * memory (64KiB by default) since some BIOSes are known to corrupt + * low memory. See the Kconfig help text for X86_RESERVE_LOW. + * + * In addition, make sure page 0 is always reserved because on + * systems with L1TF its contents can be leaked to user processes. + */ memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); + + early_reserve_initrd(); + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); + + memblock_x86_reserve_range_setup_data(); + + reserve_ibft_region(); + reserve_bios_regions(); } - + /* * Dump out kernel offset information on panic. */ @@ -764,29 +787,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) void __init setup_arch(char **cmdline_p) { - /* - * Reserve the memory occupied by the kernel between _text and - * __end_of_kernel_reserve symbols. Any kernel sections after the - * __end_of_kernel_reserve symbol must be explicitly reserved with a - * separate memblock_reserve() or they will be discarded. - */ - memblock_reserve(__pa_symbol(_text), - (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); - - /* - * Make sure page 0 is always reserved because on systems with - * L1TF its contents can be leaked to user processes. - */ - memblock_reserve(0, PAGE_SIZE); - - early_reserve_initrd(); - - /* - * At this point everything still needed from the boot loader - * or BIOS or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -822,7 +822,6 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); - arch_init_ideal_nops(); jump_label_init(); static_call_init(); early_ioremap_init(); @@ -910,8 +909,18 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - if (efi_enabled(EFI_BOOT)) - efi_memblock_x86_reserve_range(); + /* + * Do some memory reservations *before* memory is added to + * memblock, so memblock allocations won't overwrite it. + * Do it after early param, so we could get (unlikely) panic from + * serial. + * + * After this point everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + early_reserve_memory(); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux @@ -938,9 +947,6 @@ void __init setup_arch(char **cmdline_p) x86_report_nx(); - /* after early param, so could get panic from serial */ - memblock_x86_reserve_range_setup_data(); - if (acpi_mps_check()) { #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; @@ -1032,14 +1038,12 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); - reserve_ibft_region(); - early_alloc_pgt_buf(); /* * Need to conclude brk, before e820__memblock_setup() - * it could use memblock_find_in_range, could overlap with - * brk area. + * it could use memblock_find_in_range, could overlap with + * brk area. */ reserve_brk(); @@ -1054,8 +1058,6 @@ void __init setup_arch(char **cmdline_p) */ sev_setup_arch(); - reserve_bios_regions(); - efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); @@ -1081,8 +1083,12 @@ void __init setup_arch(char **cmdline_p) reserve_real_mode(); - trim_platform_memory_ranges(); - trim_low_memory_range(); + /* + * Reserving memory causing GPU hangs on Sandy Bridge integrated + * graphics devices should be done after we allocated memory under + * 1M for the real mode trampoline. + */ + trim_snb_memory(); init_mem_mapping(); @@ -1129,6 +1135,8 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); acpi_table_upgrade(); + /* Look for ACPI tables and reserve memory occupied by them. */ + acpi_boot_table_init(); vsmp_init(); @@ -1136,11 +1144,6 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); - early_acpi_boot_init(); initmem_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index fd945ce78554..0941d2f44f2a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -224,7 +224,6 @@ void __init setup_per_cpu_areas(void) per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); - setup_stack_canary_segment(cpu); /* * Copy data used in early init routines from the * initial arrays to the per cpu data areas. These diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index cdc04d091242..0aa9f13efd57 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -24,7 +24,7 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int reason) { u64 val = GHCB_SEV_TERMINATE; @@ -186,7 +186,6 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) * make it accessible to the hypervisor. * * In particular, check for: - * - Hypervisor CPUID bit * - Availability of CPUID leaf 0x8000001f * - SEV CPUID bit. * @@ -194,10 +193,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) * can't be checked here. */ - if ((fn == 1 && !(regs->cx & BIT(31)))) - /* Hypervisor bit */ - goto fail; - else if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) /* SEV leaf check */ goto fail; else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) @@ -210,12 +206,8 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) return; fail: - sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE); - VMGEXIT(); - - /* Shouldn't get here - if we do halt the machine */ - while (true) - asm volatile("hlt\n"); + /* Terminate the guest */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c index 04a780abb512..73873b007838 100644 --- a/arch/x86/kernel/sev-es.c +++ b/arch/x86/kernel/sev-es.c @@ -137,29 +137,41 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs) } /* - * This function handles the case when an NMI is raised in the #VC exception - * handler entry code. In this case, the IST entry for #VC must be adjusted, so - * that any subsequent #VC exception will not overwrite the stack contents of the - * interrupted #VC handler. + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. * * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested - * sev_es_ist_exit() call may adjust back the IST entry too early. + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. */ void noinstr __sev_es_ist_enter(struct pt_regs *regs) { unsigned long old_ist, new_ist; /* Read old IST entry */ - old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - /* Make room on the IST stack */ + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. + */ if (on_vc_stack(regs)) - new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist); - else - new_ist = old_ist - sizeof(old_ist); + new_ist = regs->sp; - /* Store old IST entry */ + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); *(unsigned long *)new_ist = old_ist; /* Set new IST entry */ @@ -251,39 +263,54 @@ static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); } -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) { char buffer[MAX_INSN_SIZE]; - enum es_result ret; int res; - if (user_mode(ctxt->regs)) { - res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (!res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } - if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res)) - return ES_DECODE_FAILED; - } else { - res = vc_fetch_insn_kernel(ctxt, buffer); - if (res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) + return ES_DECODE_FAILED; - insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1); - insn_get_length(&ctxt->insn); + if (ctxt->insn.immediate.got) + return ES_OK; + else + return ES_DECODE_FAILED; +} + +static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res, ret; + + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; } - ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED; + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + else + return ES_OK; +} - return ret; +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + if (user_mode(ctxt->regs)) + return __vc_decode_user_insn(ctxt); + else + return __vc_decode_kern_insn(ctxt); } static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f306e85a08a6..a06cb107c0e8 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -492,7 +492,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, * SS descriptor, but we do need SS to be valid. It's possible * that the old SS is entirely bogus -- this can happen if the * signal we're trying to deliver is #GP or #SS caused by a bad - * SS value. We also have a compatbility issue here: DOSEMU + * SS value. We also have a compatibility issue here: DOSEMU * relies on the contents of the SS register indicating the * SS value at the time of the signal, even though that code in * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a5330ff498f0..0e5d0a7e203b 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGFPE != 15); BUILD_BUG_ON(NSIGSEGV != 9); BUILD_BUG_ON(NSIGBUS != 5); - BUILD_BUG_ON(NSIGTRAP != 5); + BUILD_BUG_ON(NSIGTRAP != 6); BUILD_BUG_ON(NSIGCHLD != 6); BUILD_BUG_ON(NSIGSYS != 2); @@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10); + CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index eff4ce3b10da..06db901fabe8 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -67,7 +67,7 @@ * 5AP. symmetric IO mode (normal Linux operation) not affected. * 'noapic' mode has vector 0xf filled out properly. * 6AP. 'noapic' mode might be affected - fixed in later steppings - * 7AP. We do not assume writes to the LVT deassering IRQs + * 7AP. We do not assume writes to the LVT deasserting IRQs * 8AP. We do not enable low power mode (deep sleep) during MP bootup * 9AP. We do not use mixed mode * @@ -204,7 +204,7 @@ static void native_stop_other_cpus(int wait) } /* * Don't wait longer than 10 ms if the caller didn't - * reqeust it. If wait is true, the machine hangs here if + * request it. If wait is true, the machine hangs here if * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 02813a7f3a7c..7ffb0cf3f997 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -458,29 +458,52 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id) + return true; + return false; +} + /* - * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. + * Unlike the other levels, we do not enforce keeping a + * multicore group inside a NUMA node. If this happens, we will + * discard the MC level of the topology later. + */ +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return true; + return false; +} + +/* + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. * - * These are Intel CPUs that enumerate an LLC that is shared by - * multiple NUMA nodes. The LLC on these systems is shared for - * off-package data access but private to the NUMA node (half - * of the package) for on-package access. + * Any Intel CPU that has multiple nodes per package and does not + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. * - * CPUID (the source of the information about the LLC) can only - * enumerate the cache as being shared *or* unshared, but not - * this particular configuration. The CPU in this case enumerates - * the cache to be shared across the entire package (spanning both - * NUMA nodes). + * When in SNC mode, these CPUs enumerate an LLC that is shared + * by multiple NUMA nodes. The LLC is shared for off-package data + * access but private to the NUMA node (half of the package) for + * on-package access. CPUID (the source of the information about + * the LLC) can only enumerate the cache as shared or unshared, + * but not this particular configuration. */ -static const struct x86_cpu_id snc_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL), +static const struct x86_cpu_id intel_cod_cpu[] = { + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ + X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ {} }; static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + bool intel_snc = id && id->driver_data; /* Do not match if we do not have a valid APICID for cpu: */ if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) @@ -495,32 +518,12 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * means 'c' does not share the LLC of 'o'. This will be * reflected to userspace. */ - if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) return false; return topology_sane(c, o, "llc"); } -/* - * Unlike the other levels, we do not enforce keeping a - * multicore group inside a NUMA node. If this happens, we will - * discard the MC level of the topology later. - */ -static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if (c->phys_proc_id == o->phys_proc_id) - return true; - return false; -} - -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) -{ - if ((c->phys_proc_id == o->phys_proc_id) && - (c->cpu_die_id == o->cpu_die_id)) - return true; - return false; -} - #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) @@ -592,14 +595,23 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); + if (match_pkg(c, o) && !topology_same_node(c, o)) + x86_has_numa_in_package = true; + if ((i == cpu) || (has_smt && match_smt(c, o))) link_mask(topology_sibling_cpumask, cpu, i); if ((i == cpu) || (has_mp && match_llc(c, o))) link_mask(cpu_llc_shared_mask, cpu, i); + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); } + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; + /* * This needs a separate iteration over the cpus because we rely on all * topology_sibling_cpumask links to be set-up. @@ -613,8 +625,7 @@ void set_cpu_sibling_map(int cpu) /* * Does this new cpu bringup a new core? */ - if (cpumask_weight( - topology_sibling_cpumask(cpu)) == 1) { + if (threads == 1) { /* * for each core in package, increment * the booted_cores for this new cpu @@ -631,16 +642,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_pkg(c, o) && !topology_same_node(c, o)) - x86_has_numa_in_package = true; - - if ((i == cpu) || (has_mp && match_die(c, o))) - link_mask(topology_die_cpumask, cpu, i); } - - threads = cpumask_weight(topology_sibling_cpumask(cpu)); - if (threads > __max_smt_threads) - __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ @@ -1407,7 +1409,7 @@ void __init calculate_max_logical_packages(void) int ncpus; /* - * Today neither Intel nor AMD support heterogenous systems so + * Today neither Intel nor AMD support heterogeneous systems so * extrapolate the boot cpu's data to all packages. */ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); @@ -1659,13 +1661,17 @@ void play_dead_common(void) local_irq_disable(); } -static bool wakeup_cpu0(void) +/** + * cond_wakeup_cpu0 - Wake up CPU0 if needed. + * + * If NMI wants to wake up CPU0, start CPU0. + */ +void cond_wakeup_cpu0(void) { if (smp_processor_id() == 0 && enable_start_cpu0) - return true; - - return false; + start_cpu0(); } +EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); /* * We need to flush the caches before going to sleep, lest we have @@ -1734,11 +1740,8 @@ static inline void mwait_play_dead(void) __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } @@ -1749,11 +1752,8 @@ void hlt_play_dead(void) while (1) { native_halt(); - /* - * If NMI wants to wake up CPU0, start CPU0. - */ - if (wakeup_cpu0()) - start_cpu0(); + + cond_wakeup_cpu0(); } } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8627fda8d993..15b058eefc4e 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -29,12 +29,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, } } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 9442c4136c38..ea028e736831 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -34,7 +34,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void break; case NOP: - code = ideal_nops[NOP_ATOMIC5]; + code = x86_nops[5]; break; case JMP: @@ -66,7 +66,7 @@ static void __static_call_validate(void *insn, bool tail) return; } else { if (opcode == CALL_INSN_OPCODE || - !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) || + !memcmp(insn, x86_nops[5], 5) || !memcmp(insn, xor5rax, 5)) return; } diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 653b7f617b61..8a56a6d80098 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -10,7 +10,7 @@ * EFI Quirks * Several EFI systems do not correctly advertise their boot framebuffers. * Hence, we use this static table of known broken machines and fix up the - * information so framebuffer drivers can load corectly. + * information so framebuffer drivers can load correctly. */ #include <linux/dmi.h> diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 4c09ba110204..f9af561c3cd4 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -49,6 +49,30 @@ bool tboot_enabled(void) return tboot != NULL; } +/* noinline to prevent gcc from warning about dereferencing constant fixaddr */ +static noinline __init bool check_tboot_version(void) +{ + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + return false; + } + + if (tboot->version < 5) { + pr_warn("tboot version is invalid: %u\n", tboot->version); + return false; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); + + return true; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ @@ -66,25 +90,9 @@ void __init tboot_probe(void) /* Map and check for tboot UUID. */ set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); - tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); - if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); + tboot = (void *)fix_to_virt(FIX_TBOOT_BASE); + if (!check_tboot_version()) tboot = NULL; - return; - } - if (tboot->version < 5) { - pr_warn("tboot version is invalid: %u\n", tboot->version); - tboot = NULL; - return; - } - - pr_info("found shared page at phys addr 0x%llx:\n", - boot_params.tboot_addr); - pr_debug("version: %d\n", tboot->version); - pr_debug("log_addr: 0x%08x\n", tboot->log_addr); - pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); - pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); - pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); } static pgd_t *tboot_pg_dir; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 64a496a0687f..3c883e064242 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -164,17 +164,11 @@ int do_set_thread_area(struct task_struct *p, int idx, savesegment(fs, sel); if (sel == modified_sel) loadsegment(fs, sel); - - savesegment(gs, sel); - if (sel == modified_sel) - load_gs_index(sel); #endif -#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, sel); if (sel == modified_sel) - loadsegment(gs, sel); -#endif + load_gs_index(sel); } else { #ifdef CONFIG_X86_64 if (p->thread.fsindex == modified_sel) diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index f5477eab5692..bd83748e2bde 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -113,7 +113,7 @@ int arch_register_cpu(int num) * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate * depends on BSP. PIC interrupts depend on BSP. * - * If the BSP depencies are under control, one can tell kernel to + * If the BSP dependencies are under control, one can tell kernel to * enable BSP hotplug. This basically adds a control file and * one can attempt to offline BSP. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ac1874a2a70e..853ea7a80806 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -395,7 +395,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) /* * Adjust our frame so that we return straight to the #GP * vector with the expected RSP value. This is safe because - * we won't enable interupts or schedule before we invoke + * we won't enable interrupts or schedule before we invoke * general_protection, so nothing will clobber the stack * frame we just set up. * @@ -498,14 +498,15 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, { u8 insn_buf[MAX_INSN_SIZE]; struct insn insn; + int ret; if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE)) return GP_NO_HINT; - kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE); - insn_get_modrm(&insn); - insn_get_sib(&insn); + ret = insn_decode_kernel(&insn, insn_buf); + if (ret < 0) + return GP_NO_HINT; *addr = (unsigned long)insn_get_addr_ref(&insn, regs); if (*addr == -1UL) @@ -556,7 +557,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) tsk->thread.trap_nr = X86_TRAP_GP; if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0)) - return; + goto exit; show_signal(tsk, SIGSEGV, "", desc, regs, error_code); force_sig(SIGSEGV); @@ -889,9 +890,6 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; - if (kprobe_debug_handler(regs)) - goto out; - /* * The kernel doesn't use INT1 */ @@ -978,6 +976,10 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, goto out_irq; } + /* #DB for bus lock can only be triggered from userspace. */ + if (dr6 & DR_BUS_LOCK) + handle_bus_lock(regs); + /* Add the virtual_dr6 bits for signals. */ dr6 |= current->thread.virtual_dr6; if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp) @@ -1057,7 +1059,7 @@ static void math_error(struct pt_regs *regs, int trapnr) goto exit; if (fixup_vdso_exception(regs, trapnr, 0, 0)) - return; + goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index f70dffc2771f..57ec01192180 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -14,6 +14,7 @@ #include <linux/percpu.h> #include <linux/timex.h> #include <linux/static_key.h> +#include <linux/static_call.h> #include <asm/hpet.h> #include <asm/timer.h> @@ -254,7 +255,7 @@ unsigned long long sched_clock(void) bool using_native_sched_clock(void) { - return pv_ops.time.sched_clock == native_sched_clock; + return static_call_query(pv_sched_clock) == native_sched_clock; } #else unsigned long long @@ -739,7 +740,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void) * 2) Reference counter. If available we use the HPET or the * PMTIMER as a reference to check the sanity of that value. * We use separate TSC readouts and check inside of the - * reference read for any possible disturbance. We dicard + * reference read for any possible disturbance. We discard * disturbed values here as well. We do that around the PIT * calibration delay loop as we have to wait for a certain * amount of time anyway. @@ -1079,7 +1080,7 @@ static void tsc_resume(struct clocksource *cs) * very small window right after one CPU updated cycle_last under * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which * is smaller than the cycle_last reference value due to a TSC which - * is slighty behind. This delta is nowhere else observable, but in + * is slightly behind. This delta is nowhere else observable, but in * that case it results in a forward time jump in the range of hours * due to the unsigned delta calculation of the time keeping core * code, which is necessary to support wrapping clocksources like pm @@ -1264,7 +1265,7 @@ EXPORT_SYMBOL(convert_art_to_tsc); * corresponding clocksource * @cycles: System counter value * @cs: Clocksource corresponding to system counter value. Used - * by timekeeping code to verify comparibility of two cycle + * by timekeeping code to verify comparability of two cycle * values. */ diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 3d3c761eb74a..50a4515fe0ad 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -472,7 +472,7 @@ retry: /* * Add the result to the previous adjustment value. * - * The adjustement value is slightly off by the overhead of the + * The adjustment value is slightly off by the overhead of the * sync mechanism (observed values are ~200 TSC cycles), but this * really depends on CPU, node distance and frequency. So * compensating for this is hard to get right. Experiments show diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f6225bf22c02..8daa70b0d2da 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -272,7 +272,7 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, * by whether the operand is a register or a memory location. * If operand is a register, return as many bytes as the operand * size. If operand is memory, return only the two least - * siginificant bytes. + * significant bytes. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) *data_size = insn->opnd_bytes; @@ -356,7 +356,7 @@ bool fixup_umip_exception(struct pt_regs *regs) if (!nr_copied) return false; - if (!insn_decode(&insn, regs, buf, nr_copied)) + if (!insn_decode_from_regs(&insn, regs, buf, nr_copied)) return false; umip_inst = identify_insn(&insn); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index a2b413394917..b63cf8f7745e 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -276,12 +276,12 @@ static bool is_prefix_bad(struct insn *insn) static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64) { + enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32; u32 volatile *good_insns; + int ret; - insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); - /* has the side-effect of processing the entire instruction */ - insn_get_length(insn); - if (!insn_complete(insn)) + ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m); + if (ret < 0) return -ENOEXEC; if (is_prefix_bad(insn)) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index a788d5120d4d..f6b93a35ce14 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -84,6 +84,18 @@ config KVM_INTEL To compile this as a module, choose M here: the module will be called kvm-intel. +config X86_SGX_KVM + bool "Software Guard eXtensions (SGX) Virtualization" + depends on X86_SGX && KVM_INTEL + help + + Enables KVM guests to create SGX enclaves. + + This includes support to expose "raw" unreclaimable enclave memory to + guests via a device node, e.g. /dev/sgx_vepc. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 1b4766fe1de2..eafc4d601f25 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y += -Iarch/x86/kvm +ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror ifeq ($(CONFIG_FRAME_POINTER),y) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6bd2f8b830e4..c02466a1410b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1033,7 +1033,7 @@ EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); * - Centaur: 0xc0000000 - 0xcfffffff * * The Hypervisor class is further subdivided into sub-classes that each act as - * their own indepdent class associated with a 0x100 byte range. E.g. if Qemu + * their own independent class associated with a 0x100 byte range. E.g. if Qemu * is advertising support for both HyperV and KVM, the resulting Hypervisor * CPUID sub-classes are: * diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f7970ba6219f..cdd2a2b6550e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3222,7 +3222,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, } /* - * Now load segment descriptors. If fault happenes at this stage + * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8a4de3f12820..d5b72a08e566 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -269,7 +269,7 @@ int kvm_set_routing_entry(struct kvm *kvm, const struct kvm_irq_routing_entry *ue) { /* We can't check irqchip_in_kernel() here as some callers are - * currently inititalizing the irqchip. Other callers should therefore + * currently initializing the irqchip. Other callers should therefore * check kvm_arch_can_set_irq_routing() before calling this function. */ switch (ue->type) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d75524bc8423..62b1729277ef 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4961,7 +4961,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, /* * No need to care whether allocation memory is successful - * or not since pte prefetch is skiped if it does not have + * or not since pte prefetch is skipped if it does not have * enough objects in the cache. */ mmu_topup_memory_caches(vcpu, true); @@ -5884,6 +5884,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); + bool flush = false; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); @@ -5905,19 +5906,19 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); if (is_tdp_mmu_page(sp)) { - kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, - sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + flush |= kvm_tdp_mmu_zap_sp(kvm, sp); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); } if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; } } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 1f6f98c76bdf..360983865398 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -59,7 +59,7 @@ struct kvm_mmu_page { #ifdef CONFIG_X86_64 bool tdp_mmu_page; - /* Used for freeing the page asyncronously if it is a TDP MMU page. */ + /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; #endif }; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 462b1f71c77f..34207b874886 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield); + gfn_t start, gfn_t end, bool can_yield, bool flush); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { @@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) list_del(&root->link); - zap_gfn_range(kvm, root, 0, max_gfn, false); + zap_gfn_range(kvm, root, 0, max_gfn, false, false); free_page((unsigned long)root->spt); kmem_cache_free(mmu_page_header_cache, root); @@ -404,7 +404,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * If this warning were to trigger it would indicate that there was a * missing MMU notifier or a race with some notifier handler. * A present, leaf SPTE should never be directly replaced with another - * present leaf SPTE pointing to a differnt PFN. A notifier handler + * present leaf SPTE pointing to a different PFN. A notifier handler * should be zapping the SPTE before the main MM's page table is * changed, or the SPTE should be zeroed, and the TLBs flushed by the * thread before replacement. @@ -418,7 +418,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, /* * Crash the host to prevent error propagation and guest data - * courruption. + * corruption. */ BUG(); } @@ -529,7 +529,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, /* * No other thread can overwrite the removed SPTE as they * must either wait on the MMU lock or use - * tdp_mmu_set_spte_atomic which will not overrite the + * tdp_mmu_set_spte_atomic which will not overwrite the * special removed SPTE value. No bookkeeping is needed * here since the SPTE is going from non-present * to non-present. @@ -668,20 +668,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. + * operation can cause a soft lockup. Note, in some use cases a flush may be + * required by prior actions. Ensure the pending flush is performed prior to + * yielding. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield) + gfn_t start, gfn_t end, bool can_yield, bool flush) { struct tdp_iter iter; - bool flush_needed = false; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { - flush_needed = false; + tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + flush = false; continue; } @@ -699,11 +700,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_set_spte(kvm, &iter, 0); - flush_needed = true; + flush = true; } rcu_read_unlock(); - return flush_needed; + return flush; } /* @@ -712,13 +713,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. */ -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield) { struct kvm_mmu_page *root; bool flush = false; for_each_tdp_mmu_root_yield_safe(kvm, root) - flush |= zap_gfn_range(kvm, root, start, end, true); + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); return flush; } @@ -930,7 +932,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, unsigned long unused) { - return zap_gfn_range(kvm, root, start, end, false); + return zap_gfn_range(kvm, root, start, end, false, false); } int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3b761c111bff..31096ece9b14 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -8,7 +8,29 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, + gfn_t end) +{ + return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); +} +static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + + /* + * Don't allow yielding, as the caller may have a flush pending. Note, + * if mmu_lock is held for write, zapping will never yield in this case, + * but explicitly disallow it for safety. The TDP MMU does not yield + * until it has made forward progress (steps sideways), and when zapping + * a single shadow page that it's guaranteed to see (thus the mmu_lock + * requirement), its "step sideways" will always step beyond the bounds + * of the shadow page's gfn range and stop iterating before yielding. + */ + lockdep_assert_held_write(&kvm->mmu_lock); + return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); +} void kvm_tdp_mmu_zap_all(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7b30bc967af3..67e753edfa22 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -103,7 +103,7 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, /* returns general purpose PMC with the specified MSR. Note that it can be * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a - * paramenter to tell them apart. + * parameter to tell them apart. */ static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, u32 base) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 78bdcfac4e40..3e55674098be 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -727,7 +727,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; /** - * In some cases, the existing irte is updaed and re-set, + * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ @@ -838,7 +838,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * Here, we setup with legacy mode in the following cases: * 1. When cannot target interrupt to a specific vcpu. * 2. Unsetting posted interrupt. - * 3. APIC virtialization is disabled for the vcpu. + * 3. APIC virtualization is disabled for the vcpu. * 4. IRQ has incompatible delivery mode (SMI, INIT, etc) */ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 35891d9a1099..fb204eaa8bb3 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -246,11 +246,18 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) +static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { struct kvm_vcpu *vcpu = &svm->vcpu; bool vmcb12_lma; + /* + * FIXME: these should be done after copying the fields, + * to avoid TOC/TOU races. For these save area checks + * the possible damage is limited since kvm_set_cr0 and + * kvm_set_cr4 handle failure; EFER_SVME is an exception + * so it is force-set later in nested_prepare_vmcb_save. + */ if ((vmcb12->save.efer & EFER_SVME) == 0) return false; @@ -271,7 +278,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; - return nested_vmcb_check_controls(&vmcb12->control); + return true; } static void load_nested_vmcb_control(struct vcpu_svm *svm, @@ -396,7 +403,14 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.gdtr = vmcb12->save.gdtr; svm->vmcb->save.idtr = vmcb12->save.idtr; kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); - svm_set_efer(&svm->vcpu, vmcb12->save.efer); + + /* + * Force-set EFER_SVME even though it is checked earlier on the + * VMCB12, because the guest can flip the bit between the check + * and now. Clearing EFER_SVME would call svm_free_nested. + */ + svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -468,7 +482,6 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, svm->nested.vmcb12_gpa = vmcb12_gpa; - load_nested_vmcb_control(svm, &vmcb12->control); nested_prepare_vmcb_control(svm); nested_prepare_vmcb_save(svm, vmcb12); @@ -515,7 +528,10 @@ int nested_svm_vmrun(struct vcpu_svm *svm) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - if (!nested_vmcb_checks(svm, vmcb12)) { + load_nested_vmcb_control(svm, &vmcb12->control); + + if (!nested_vmcb_check_save(svm, vmcb12) || + !nested_vmcb_check_controls(&svm->nested.ctl)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; @@ -1209,6 +1225,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (!(save->cr0 & X86_CR0_PG)) goto out_free; + if (!(save->efer & EFER_SVME)) + goto out_free; /* * All checks done, we can enter guest mode. L1 control fields diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 035da07500e8..fdf587f19c5f 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -98,6 +98,8 @@ static enum index msr_to_index(u32 msr) static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, enum pmu_type type) { + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: @@ -105,6 +107,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTL3: case MSR_F15H_PERF_CTL4: case MSR_F15H_PERF_CTL5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: if (type != PMU_TYPE_EVNTSEL) return NULL; @@ -115,6 +120,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTR3: case MSR_F15H_PERF_CTR4: case MSR_F15H_PERF_CTR5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: if (type != PMU_TYPE_COUNTER) return NULL; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 874ea309279f..415a49b8b8f8 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -14,6 +14,7 @@ #include <linux/psp-sev.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/misc_cgroup.h> #include <linux/processor.h> #include <linux/trace_events.h> #include <asm/fpu/internal.h> @@ -28,6 +29,21 @@ #define __ex(x) __kvm_handle_fault_on_reboot(x) +#ifndef CONFIG_KVM_AMD_SEV +/* + * When this config is not defined, SEV feature is not supported and APIs in + * this file are not used but this file still gets compiled into the KVM AMD + * module. + * + * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum + * misc_res_type {} defined in linux/misc_cgroup.h. + * + * Below macros allow compilation to succeed. + */ +#define MISC_CG_RES_SEV MISC_CG_RES_TYPES +#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES +#endif + static u8 sev_enc_bit; static int sev_flush_asids(void); static DECLARE_RWSEM(sev_deactivate_lock); @@ -89,8 +105,19 @@ static bool __sev_recycle_asids(int min_asid, int max_asid) static int sev_asid_new(struct kvm_sev_info *sev) { - int pos, min_asid, max_asid; + int pos, min_asid, max_asid, ret; bool retry = true; + enum misc_res_type type; + + type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + WARN_ON(sev->misc_cg); + sev->misc_cg = get_current_misc_cg(); + ret = misc_cg_try_charge(type, sev->misc_cg, 1); + if (ret) { + put_misc_cg(sev->misc_cg); + sev->misc_cg = NULL; + return ret; + } mutex_lock(&sev_bitmap_lock); @@ -108,7 +135,8 @@ again: goto again; } mutex_unlock(&sev_bitmap_lock); - return -EBUSY; + ret = -EBUSY; + goto e_uncharge; } __set_bit(pos, sev_asid_bitmap); @@ -116,6 +144,11 @@ again: mutex_unlock(&sev_bitmap_lock); return pos + 1; +e_uncharge: + misc_cg_uncharge(type, sev->misc_cg, 1); + put_misc_cg(sev->misc_cg); + sev->misc_cg = NULL; + return ret; } static int sev_get_asid(struct kvm *kvm) @@ -125,14 +158,15 @@ static int sev_get_asid(struct kvm *kvm) return sev->asid; } -static void sev_asid_free(int asid) +static void sev_asid_free(struct kvm_sev_info *sev) { struct svm_cpu_data *sd; int cpu, pos; + enum misc_res_type type; mutex_lock(&sev_bitmap_lock); - pos = asid - 1; + pos = sev->asid - 1; __set_bit(pos, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { @@ -141,6 +175,11 @@ static void sev_asid_free(int asid) } mutex_unlock(&sev_bitmap_lock); + + type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + misc_cg_uncharge(type, sev->misc_cg, 1); + put_misc_cg(sev->misc_cg); + sev->misc_cg = NULL; } static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) @@ -188,19 +227,20 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) asid = sev_asid_new(sev); if (asid < 0) return ret; + sev->asid = asid; ret = sev_platform_init(&argp->error); if (ret) goto e_free; sev->active = true; - sev->asid = asid; INIT_LIST_HEAD(&sev->regions_list); return 0; e_free: - sev_asid_free(asid); + sev_asid_free(sev); + sev->asid = 0; return ret; } @@ -1315,12 +1355,12 @@ void sev_vm_destroy(struct kvm *kvm) mutex_unlock(&kvm->lock); sev_unbind_asid(kvm, sev->handle); - sev_asid_free(sev->asid); + sev_asid_free(sev); } void __init sev_hardware_setup(void) { - unsigned int eax, ebx, ecx, edx; + unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; bool sev_es_supported = false; bool sev_supported = false; @@ -1352,7 +1392,11 @@ void __init sev_hardware_setup(void) if (!sev_reclaim_asid_bitmap) goto out; - pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1); + sev_asid_count = max_sev_asid - min_sev_asid + 1; + if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)) + goto out; + + pr_info("SEV supported: %u ASIDs\n", sev_asid_count); sev_supported = true; /* SEV-ES support requested? */ @@ -1367,7 +1411,11 @@ void __init sev_hardware_setup(void) if (min_sev_asid == 1) goto out; - pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1); + sev_es_asid_count = min_sev_asid - 1; + if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)) + goto out; + + pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count); sev_es_supported = true; out: @@ -1382,6 +1430,8 @@ void sev_hardware_teardown(void) bitmap_free(sev_asid_bitmap); bitmap_free(sev_reclaim_asid_bitmap); + misc_cg_set_capacity(MISC_CG_RES_SEV, 0); + misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0); sev_flush_asids(); } @@ -2082,7 +2132,7 @@ void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu) hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - /* PKRU is restored on VMEXIT, save the curent host value */ + /* PKRU is restored on VMEXIT, save the current host value */ hostsa->pkru = read_pkru(); /* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 58a45bb139f8..6dad89248312 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4400,7 +4400,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i * * This happens because CPU microcode reading instruction bytes * uses a special opcode which attempts to read data using CPL=0 - * priviledges. The microcode reads CS:RIP and if it hits a SMAP + * privileges. The microcode reads CS:RIP and if it hits a SMAP * fault, it gives up and returns no instruction bytes. * * Detection: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 39e071fdab0c..9806aaebc37f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -65,6 +65,7 @@ struct kvm_sev_info { unsigned long pages_locked; /* Number of pages locked */ struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ + struct misc_cg *misc_cg; /* For misc cgroup accounting */ }; struct kvm_svm { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bcca0b80e0d0..1e069aac7410 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3537,7 +3537,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * snapshot restore (migration). * * In this flow, it is assumed that vmcs12 cache was - * trasferred as part of captured nVMX state and should + * transferred as part of captured nVMX state and should * therefore not be read from guest memory (which may not * exist on destination host yet). */ diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 4831bc44ce66..459748680daf 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -10,7 +10,7 @@ #include "vmx.h" /* - * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we + * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we * can find which vCPU should be waken up. */ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 32cf8287d4a7..bcbf0d2139e9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1529,7 +1529,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) /* * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that - * utilize encodings marked reserved will casue a #GP fault. + * utilize encodings marked reserved will cause a #GP fault. */ value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && @@ -2761,7 +2761,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * Update real mode segment cache. It may be not up-to-date if sement + * Update real mode segment cache. It may be not up-to-date if segment * register was written while vcpu was in a guest mode. */ vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); @@ -6027,19 +6027,19 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && exit_reason.basic != EXIT_REASON_TASK_SWITCH)) { + int ndata = 3; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason.full; vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { - vcpu->run->internal.ndata++; - vcpu->run->internal.data[3] = + vcpu->run->internal.data[ndata++] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } - vcpu->run->internal.data[vcpu->run->internal.ndata++] = - vcpu->arch.last_vmentry_cpu; + vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; + vcpu->run->internal.ndata = ndata; return 0; } @@ -7252,7 +7252,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; - /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe806e894212..efc7a82ab140 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -156,9 +156,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancment of 1000ns. '0' disables + * adaptive tuning starting from default advancement of 1000ns. '0' disables * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows priveleged userspace to set an exact advancement time. + * tuning, i.e. allows privileged userspace to set an exact advancement time. */ static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); @@ -271,8 +271,7 @@ static struct kmem_cache *x86_emulator_cache; * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. */ -static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, - u64 data, bool write) +static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; @@ -1288,7 +1287,7 @@ static const u32 emulated_msrs_all[] = { MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, - MSR_IA32_TSCDEADLINE, + MSR_IA32_TSC_DEADLINE, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, @@ -1373,7 +1372,7 @@ static u64 kvm_get_arch_capabilities(void) /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that * the nested hypervisor runs with NX huge pages. If it is not, - * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other * L1 guests, so it need not worry about its own (L2) guests. */ data |= ARCH_CAP_PSCHANGE_MC_NO; @@ -1445,7 +1444,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) r = 0; } @@ -1620,7 +1619,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(vcpu, index, data, true)) + if (kvm_msr_ignored_check(index, data, true)) ret = 0; return ret; @@ -1658,7 +1657,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) ret = 0; } @@ -1850,7 +1849,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) ret = EXIT_FASTPATH_EXIT_HANDLED; } break; - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: data = kvm_read_edx_eax(vcpu); if (!handle_fastpath_set_tscdeadline(vcpu, data)) { kvm_skip_emulated_instruction(vcpu); @@ -2329,7 +2328,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; } else if (!already_matched) { @@ -2337,7 +2336,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } kvm_track_tsc_matching(vcpu); - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2559,13 +2558,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) int i; struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; + unsigned long flags; kvm_hv_invalidate_tsc_page(kvm); - spin_lock(&ka->pvclock_gtod_sync_lock); kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2573,8 +2575,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } @@ -2582,17 +2582,18 @@ u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + unsigned long flags; u64 ret; - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); @@ -2686,13 +2687,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -3086,7 +3087,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: @@ -3448,7 +3449,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: @@ -4024,7 +4025,6 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { struct kvm_host_map map; struct kvm_steal_time *st; - int idx; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4032,15 +4032,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, &vcpu->arch.st.cache, true)) - goto out; + return; st = map.hva + offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); @@ -4048,20 +4042,25 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); - -out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + int idx; + if (vcpu->preempted && !vcpu->arch.guest_state_protected) vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); if (kvm_xen_msr_enabled(vcpu->kvm)) kvm_xen_runstate_set_preempted(vcpu); else kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -5726,6 +5725,7 @@ set_pit2_out: } #endif case KVM_SET_CLOCK: { + struct kvm_arch *ka = &kvm->arch; struct kvm_clock_data user_ns; u64 now_ns; @@ -5744,8 +5744,22 @@ set_pit2_out: * pvclock_update_vm_gtod_copy(). */ kvm_gen_update_masterclock(kvm); - now_ns = get_kvmclock_ns(kvm); - kvm->arch.kvmclock_offset += user_ns.clock - now_ns; + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'user_ns.clock' is very small. + */ + spin_lock_irq(&ka->pvclock_gtod_sync_lock); + if (kvm->arch.use_master_clock) + now_ns = ka->master_kernel_ns; + else + now_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = user_ns.clock - now_ns; + spin_unlock_irq(&ka->pvclock_gtod_sync_lock); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } @@ -7724,6 +7738,7 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm *kvm; struct kvm_vcpu *vcpu; int cpu; + unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -7739,17 +7754,15 @@ static void kvm_hyperv_tsc_notifier(void) list_for_each_entry(kvm, &vm_list, vm_list) { struct kvm_arch *ka = &kvm->arch; - spin_lock(&ka->pvclock_gtod_sync_lock); - + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); } mutex_unlock(&kvm_lock); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 39eb04887141..9035e34aa156 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -250,7 +250,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 get_kvmclock_ns(struct kvm *kvm); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 3b6544111ac9..16bc9130e7a5 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -6,7 +6,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> /* if you want SMP support, implement these with real spinlocks */ .macro LOCK reg diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index 1c5c81c16b06..ce6935690766 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -6,7 +6,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> .macro read64 reg movl %ebx, %eax diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 2402d4c489d2..db4b4f9197c7 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -3,7 +3,7 @@ #include <linux/linkage.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> /* diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 77b9b2a3b5c8..57b79c577496 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -11,7 +11,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/asm.h> #include <asm/smap.h> #include <asm/export.h> diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index 12539fca75c4..b0f3b2a62ae2 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -4,7 +4,7 @@ * * Written by Masami Hiramatsu <mhiramat@redhat.com> */ -#include <asm/insn.h> +#include <asm/insn.h> /* __ignore_sync_check__ */ /* Attribute tables are generated from opcode map */ #include "inat-tables.c" diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index bb0b3fe1e0a0..a67afd74232c 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -232,7 +232,7 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) * resolve_seg_reg() - obtain segment register index * @insn: Instruction with operands * @regs: Register values as seen when entering kernel mode - * @regoff: Operand offset, in pt_regs, used to deterimine segment register + * @regoff: Operand offset, in pt_regs, used to determine segment register * * Determine the segment register associated with the operands and, if * applicable, prefixes and the instruction pointed by @insn. @@ -404,10 +404,6 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) case INAT_SEG_REG_FS: return (unsigned short)(regs->fs & 0xffff); case INAT_SEG_REG_GS: - /* - * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS. - * The macro below takes care of both cases. - */ return get_user_gs(regs); case INAT_SEG_REG_IGNORE: default: @@ -517,7 +513,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, * @insn: Instruction containing ModRM byte * @regs: Register values as seen when entering kernel mode * @offs1: Offset of the first operand register - * @offs2: Offset of the second opeand register, if applicable + * @offs2: Offset of the second operand register, if applicable * * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte * in @insn. This function is to be used with 16-bit address encodings. The @@ -576,7 +572,7 @@ static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs, * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement- * only addressing. This means that no registers are involved in * computing the effective address. Thus, ensure that the first - * register offset is invalild. The second register offset is already + * register offset is invalid. The second register offset is already * invalid under the aforementioned conditions. */ if ((X86_MODRM_MOD(insn->modrm.value) == 0) && @@ -928,10 +924,11 @@ static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs, static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs, int *regoff, long *eff_addr) { - insn_get_modrm(insn); + int ret; - if (!insn->modrm.nbytes) - return -EINVAL; + ret = insn_get_modrm(insn); + if (ret) + return ret; if (X86_MODRM_MOD(insn->modrm.value) != 3) return -EINVAL; @@ -977,14 +974,14 @@ static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs, int *regoff, long *eff_addr) { long tmp; + int ret; if (insn->addr_bytes != 8 && insn->addr_bytes != 4) return -EINVAL; - insn_get_modrm(insn); - - if (!insn->modrm.nbytes) - return -EINVAL; + ret = insn_get_modrm(insn); + if (ret) + return ret; if (X86_MODRM_MOD(insn->modrm.value) > 2) return -EINVAL; @@ -1106,18 +1103,21 @@ static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs, * @base_offset will have a register, as an offset from the base of pt_regs, * that can be used to resolve the associated segment. * - * -EINVAL on error. + * Negative value on error. */ static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs, int *base_offset, long *eff_addr) { long base, indx; int indx_offset; + int ret; if (insn->addr_bytes != 8 && insn->addr_bytes != 4) return -EINVAL; - insn_get_modrm(insn); + ret = insn_get_modrm(insn); + if (ret) + return ret; if (!insn->modrm.nbytes) return -EINVAL; @@ -1125,7 +1125,9 @@ static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs, if (X86_MODRM_MOD(insn->modrm.value) > 2) return -EINVAL; - insn_get_sib(insn); + ret = insn_get_sib(insn); + if (ret) + return ret; if (!insn->sib.nbytes) return -EINVAL; @@ -1194,8 +1196,8 @@ static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs) short eff_addr; long tmp; - insn_get_modrm(insn); - insn_get_displacement(insn); + if (insn_get_displacement(insn)) + goto out; if (insn->addr_bytes != 2) goto out; @@ -1492,7 +1494,7 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_IN } /** - * insn_decode() - Decode an instruction + * insn_decode_from_regs() - Decode an instruction * @insn: Structure to store decoded instruction * @regs: Structure with register values as seen when entering kernel mode * @buf: Buffer containing the instruction bytes @@ -1505,8 +1507,8 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_IN * * True if instruction was decoded, False otherwise. */ -bool insn_decode(struct insn *insn, struct pt_regs *regs, - unsigned char buf[MAX_INSN_SIZE], int buf_size) +bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs, + unsigned char buf[MAX_INSN_SIZE], int buf_size) { int seg_defs; @@ -1529,7 +1531,9 @@ bool insn_decode(struct insn *insn, struct pt_regs *regs, insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs); insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs); - insn_get_length(insn); + if (insn_get_length(insn)) + return false; + if (buf_size < insn->length) return false; diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 435630a6ec97..058f19b20465 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -11,10 +11,13 @@ #else #include <string.h> #endif -#include <asm/inat.h> -#include <asm/insn.h> +#include <asm/inat.h> /*__ignore_sync_check__ */ +#include <asm/insn.h> /* __ignore_sync_check__ */ -#include <asm/emulate_prefix.h> +#include <linux/errno.h> +#include <linux/kconfig.h> + +#include <asm/emulate_prefix.h> /* __ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ @@ -51,6 +54,7 @@ * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) @@ -111,8 +115,12 @@ static void insn_get_emulate_prefix(struct insn *insn) * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. + * + * * Returns: + * 0: on success + * < 0: on error */ -void insn_get_prefixes(struct insn *insn) +int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; @@ -120,7 +128,7 @@ void insn_get_prefixes(struct insn *insn) int i, nb; if (prefixes->got) - return; + return 0; insn_get_emulate_prefix(insn); @@ -230,8 +238,10 @@ vex_end: prefixes->got = 1; + return 0; + err_out: - return; + return -ENODATA; } /** @@ -243,16 +253,25 @@ err_out: * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_opcode(struct insn *insn) +int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; + int pfx_id, ret; insn_byte_t op; - int pfx_id; + if (opcode->got) - return; - if (!insn->prefixes.got) - insn_get_prefixes(insn); + return 0; + + if (!insn->prefixes.got) { + ret = insn_get_prefixes(insn); + if (ret) + return ret; + } /* Get first opcode */ op = get_next(insn_byte_t, insn); @@ -267,9 +286,13 @@ void insn_get_opcode(struct insn *insn) insn->attr = inat_get_avx_attribute(op, m, p); if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && - !inat_is_group(insn->attr))) - insn->attr = 0; /* This instruction is bad */ - goto end; /* VEX has only 1 byte for opcode */ + !inat_is_group(insn->attr))) { + /* This instruction is bad */ + insn->attr = 0; + return -EINVAL; + } + /* VEX has only 1 byte for opcode */ + goto end; } insn->attr = inat_get_opcode_attribute(op); @@ -280,13 +303,18 @@ void insn_get_opcode(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } - if (inat_must_vex(insn->attr)) - insn->attr = 0; /* This instruction is bad */ + + if (inat_must_vex(insn->attr)) { + /* This instruction is bad */ + insn->attr = 0; + return -EINVAL; + } end: opcode->got = 1; + return 0; err_out: - return; + return -ENODATA; } /** @@ -296,15 +324,25 @@ err_out: * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_modrm(struct insn *insn) +int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; + int ret; + if (modrm->got) - return; - if (!insn->opcode.got) - insn_get_opcode(insn); + return 0; + + if (!insn->opcode.got) { + ret = insn_get_opcode(insn); + if (ret) + return ret; + } if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); @@ -313,17 +351,22 @@ void insn_get_modrm(struct insn *insn) pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); - if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) - insn->attr = 0; /* This is bad */ + if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { + /* Bad insn */ + insn->attr = 0; + return -EINVAL; + } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; + modrm->got = 1; + return 0; err_out: - return; + return -ENODATA; } @@ -337,11 +380,16 @@ err_out: int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; + int ret; if (!insn->x86_64) return 0; - if (!modrm->got) - insn_get_modrm(insn); + + if (!modrm->got) { + ret = insn_get_modrm(insn); + if (ret) + return 0; + } /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. @@ -355,15 +403,25 @@ int insn_rip_relative(struct insn *insn) * * If necessary, first collects the instruction up to and including the * ModRM byte. + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. */ -void insn_get_sib(struct insn *insn) +int insn_get_sib(struct insn *insn) { insn_byte_t modrm; + int ret; if (insn->sib.got) - return; - if (!insn->modrm.got) - insn_get_modrm(insn); + return 0; + + if (!insn->modrm.got) { + ret = insn_get_modrm(insn); + if (ret) + return ret; + } + if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && @@ -374,8 +432,10 @@ void insn_get_sib(struct insn *insn) } insn->sib.got = 1; + return 0; + err_out: - return; + return -ENODATA; } @@ -386,15 +446,25 @@ err_out: * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. + * + * * Returns: + * 0: if decoding succeeded + * < 0: otherwise. */ -void insn_get_displacement(struct insn *insn) +int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; + int ret; if (insn->displacement.got) - return; - if (!insn->sib.got) - insn_get_sib(insn); + return 0; + + if (!insn->sib.got) { + ret = insn_get_sib(insn); + if (ret) + return ret; + } + if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: @@ -436,9 +506,10 @@ void insn_get_displacement(struct insn *insn) } out: insn->displacement.got = 1; + return 0; err_out: - return; + return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ @@ -537,20 +608,30 @@ err_out: } /** - * insn_get_immediate() - Get the immediates of instruction + * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded. Unsigned-value can be - * get by bit masking with ((1 << (nbytes * 8)) - 1) + * computed by bit masking with ((1 << (nbytes * 8)) - 1) + * + * Returns: + * 0: on success + * < 0: on error */ -void insn_get_immediate(struct insn *insn) +int insn_get_immediate(struct insn *insn) { + int ret; + if (insn->immediate.got) - return; - if (!insn->displacement.got) - insn_get_displacement(insn); + return 0; + + if (!insn->displacement.got) { + ret = insn_get_displacement(insn); + if (ret) + return ret; + } if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) @@ -597,9 +678,10 @@ void insn_get_immediate(struct insn *insn) } done: insn->immediate.got = 1; + return 0; err_out: - return; + return -ENODATA; } /** @@ -608,13 +690,65 @@ err_out: * * If necessary, first collects the instruction up to and including the * immediates bytes. - */ -void insn_get_length(struct insn *insn) + * + * Returns: + * - 0 on success + * - < 0 on error +*/ +int insn_get_length(struct insn *insn) { + int ret; + if (insn->length) - return; - if (!insn->immediate.got) - insn_get_immediate(insn); + return 0; + + if (!insn->immediate.got) { + ret = insn_get_immediate(insn); + if (ret) + return ret; + } + insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); + + return 0; +} + +/* Ensure this instruction is decoded completely */ +static inline int insn_complete(struct insn *insn) +{ + return insn->opcode.got && insn->modrm.got && insn->sib.got && + insn->displacement.got && insn->immediate.got; +} + +/** + * insn_decode() - Decode an x86 instruction + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @buf_len: length of the insn buffer at @kaddr + * @m: insn mode, see enum insn_mode + * + * Returns: + * 0: if decoding succeeded + * < 0: otherwise. + */ +int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) +{ + int ret; + +/* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */ + + if (m == INSN_MODE_KERN) + insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); + else + insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); + + ret = insn_get_length(insn); + if (ret) + return ret; + + if (insn_complete(insn)) + return 0; + + return -EINVAL; } diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 1e299ac73c86..1cc9da6e29c7 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,7 +4,7 @@ #include <linux/linkage.h> #include <asm/errno.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> .pushsection .noinstr.text, "ax" diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 41902fe8b859..64801010d312 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -8,7 +8,7 @@ */ #include <linux/linkage.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> #undef memmove diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 0bfd26e4ca9e..9827ae267f96 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -3,7 +3,7 @@ #include <linux/linkage.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> /* diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 419365c48b2a..cc5f4ea943d3 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -14,7 +14,7 @@ * tested so far for any MMX solution figured. * * 22/09/2000 - Arjan van de Ven - * Improved for non-egineering-sample Athlons + * Improved for non-engineering-sample Athlons * */ #include <linux/hardirq.h> diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index 75a0915b0d01..40bbe56bde32 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -252,7 +252,7 @@ static void __wrmsr_safe_regs_on_cpu(void *info) rv->err = wrmsr_safe_regs(rv->regs); } -int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; @@ -265,7 +265,7 @@ int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) } EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); -int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 3bd905e10ee2..b09cd2ad426c 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(msrs_free); * argument @m. * */ -int msr_read(u32 msr, struct msr *m) +static int msr_read(u32 msr, struct msr *m) { int err; u64 val; @@ -54,7 +54,7 @@ int msr_read(u32 msr, struct msr *m) * @msr: MSR to write * @m: value to write */ -int msr_write(u32 msr, struct msr *m) +static int msr_write(u32 msr, struct msr *m) { return wrmsrl_safe(msr, m->q); } diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index f6fb1d218dcc..4d32cb06ffd5 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -4,33 +4,65 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeatures.h> -#include <asm/alternative-asm.h> +#include <asm/alternative.h> #include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> #include <asm/frame.h> -.macro THUNK reg .section .text.__x86.indirect_thunk - .align 32 -SYM_FUNC_START(__x86_indirect_thunk_\reg) - JMP_NOSPEC \reg -SYM_FUNC_END(__x86_indirect_thunk_\reg) - -SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg) +.macro RETPOLINE reg ANNOTATE_INTRA_FUNCTION_CALL - call .Ldo_rop_\@ + call .Ldo_rop_\@ .Lspec_trap_\@: UNWIND_HINT_EMPTY pause lfence - jmp .Lspec_trap_\@ + jmp .Lspec_trap_\@ .Ldo_rop_\@: - mov %\reg, (%_ASM_SP) + mov %\reg, (%_ASM_SP) UNWIND_HINT_FUNC ret -SYM_FUNC_END(__x86_retpoline_\reg) +.endm + +.macro THUNK reg + + .align 32 + +SYM_FUNC_START(__x86_indirect_thunk_\reg) + + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ + __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD + +SYM_FUNC_END(__x86_indirect_thunk_\reg) + +.endm + +/* + * This generates .altinstr_replacement symbols for use by objtool. They, + * however, must not actually live in .altinstr_replacement since that will be + * discarded after init, but module alternatives will also reference these + * symbols. + * + * Their names matches the "__x86_indirect_" prefix to mark them as retpolines. + */ +.macro ALT_THUNK reg + + .align 1 + +SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg) + ANNOTATE_RETPOLINE_SAFE +1: call *%\reg +2: .skip 5-(2b-1b), 0x90 +SYM_FUNC_END(__x86_indirect_alt_call_\reg) + +SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg) + ANNOTATE_RETPOLINE_SAFE +1: jmp *%\reg +2: .skip 5-(2b-1b), 0x90 +SYM_FUNC_END(__x86_indirect_alt_jmp_\reg) .endm @@ -48,7 +80,6 @@ SYM_FUNC_END(__x86_retpoline_\reg) #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) #define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg) #undef GEN #define GEN(reg) THUNK reg @@ -59,5 +90,13 @@ SYM_FUNC_END(__x86_retpoline_\reg) #include <asm/GEN-for-each-reg.h> #undef GEN -#define GEN(reg) EXPORT_RETPOLINE(reg) +#define GEN(reg) ALT_THUNK reg +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_call_ ## reg) +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_jmp_ ## reg) #include <asm/GEN-for-each-reg.h> diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index 4a9887851ad8..990d847ae902 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -547,7 +547,7 @@ static void frndint_(FPU_REG *st0_ptr, u_char st0_tag) single_arg_error(st0_ptr, st0_tag); } -static int fsin(FPU_REG *st0_ptr, u_char tag) +static int f_sin(FPU_REG *st0_ptr, u_char tag) { u_char arg_sign = getsign(st0_ptr); @@ -608,6 +608,11 @@ static int fsin(FPU_REG *st0_ptr, u_char tag) } } +static void fsin(FPU_REG *st0_ptr, u_char tag) +{ + f_sin(st0_ptr, tag); +} + static int f_cos(FPU_REG *st0_ptr, u_char tag) { u_char st0_sign; @@ -724,7 +729,7 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) } reg_copy(st0_ptr, &arg); - if (!fsin(st0_ptr, st0_tag)) { + if (!f_sin(st0_ptr, st0_tag)) { push(); FPU_copy_to_reg0(&arg, st0_tag); f_cos(&st(0), st0_tag); @@ -1635,7 +1640,7 @@ void FPU_triga(void) } static FUNC_ST0 const trig_table_b[] = { - fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos + fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, fsin, fcos }; void FPU_trigb(void) diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index fe6246ff9887..7ca6417c0c8d 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -964,7 +964,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) /* The return value (in eax) is zero if the result is exact, if bits are changed due to rounding, truncation, etc, then a non-zero value is returned */ -/* Overflow is signalled by a non-zero return value (in eax). +/* Overflow is signaled by a non-zero return value (in eax). In the case of overflow, the returned significand always has the largest possible value */ int FPU_round_to_int(FPU_REG *r, u_char tag) diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S index 11a1f798451b..4a9fc3cc5a4d 100644 --- a/arch/x86/math-emu/reg_round.S +++ b/arch/x86/math-emu/reg_round.S @@ -575,7 +575,7 @@ Normalise_result: #ifdef PECULIAR_486 /* * This implements a special feature of 80486 behaviour. - * Underflow will be signalled even if the number is + * Underflow will be signaled even if the number is * not a denormal after rounding. * This difference occurs only for masked underflow, and not * in the unmasked case. diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a73347e2cdfc..1c548ad00752 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1497,7 +1497,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * userspace task is trying to access some valid (from guest's point of * view) memory which is not currently mapped by the host (e.g. the * memory is swapped out). Note, the corresponding "page ready" event - * which is injected when the memory becomes available, is delived via + * which is injected when the memory becomes available, is delivered via * an interrupt mechanism and not a #PF exception * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). * @@ -1523,7 +1523,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * * In case the fault hit a RCU idle region the conditional entry * code reenabled RCU to avoid subsequent wreckage which helps - * debugability. + * debuggability. */ state = irqentry_enter(regs); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index dd694fb93916..fbf41dd142ca 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,7 +29,7 @@ /* * We need to define the tracepoints somewhere, and tlb.c - * is only compied when SMP=y. + * is only compiled when SMP=y. */ #define CREATE_TRACE_POINTS #include <trace/events/tlb.h> @@ -756,7 +756,7 @@ void __init init_mem_mapping(void) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { - /* can we preseve max_low_pfn ?*/ + /* can we preserve max_low_pfn ?*/ max_low_pfn = max_pfn; } #else @@ -939,7 +939,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, - * decompresser could be confused by aligned initrd_end + * decompressor could be confused by aligned initrd_end * We already reserve the end partial page before in * - i386_start_kernel() * - x86_64_start_kernel() diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b5a3fa4033d3..55247451ba85 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -172,7 +172,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end) /* * With folded p4d, pgd_none() is always false, we need to - * handle synchonization on p4d level. + * handle synchronization on p4d level. */ MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref)); p4d_ref = p4d_offset(pgd_ref, addr); @@ -986,7 +986,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { /* * Do not free direct mapping pages since they were - * freed when offlining, or simplely not in use. + * freed when offlining, or simply not in use. */ if (!direct) free_pagetable(pte_page(*pte), 0); @@ -1004,7 +1004,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, * * If we are not removing the whole page, it means * other page structs in this page are being used and - * we canot remove them. So fill the unused page_structs + * we cannot remove them. So fill the unused page_structs * with 0xFD, and remove the page when it is wholly * filled with 0xFD. */ diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 6e6b39710e5f..557f0fe25dff 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -96,7 +96,7 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt phyiscal memory region size based on available memory */ + /* Adapt physical memory region size based on available memory */ if (memory_tb < kaslr_regions[0].size_tb) kaslr_regions[0].size_tb = memory_tb; diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index be020a7bc414..d3efbc5b3449 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Support for MMIO probes. - * Benfit many code from kprobes + * Benefit many code from kprobes * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. * 2007 Alexander Eichner * 2008 Pekka Paalanen <pq@iki.fi> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 4b01f7dbaf30..f633f9e23b8f 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -19,6 +19,7 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/dma-mapping.h> +#include <linux/virtio_config.h> #include <asm/tlbflush.h> #include <asm/fixmap.h> @@ -262,7 +263,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) if (pgprot_val(old_prot) == pgprot_val(new_prot)) return; - pa = pfn << page_level_shift(level); + pa = pfn << PAGE_SHIFT; size = page_level_size(level); /* @@ -484,3 +485,8 @@ void __init mem_encrypt_init(void) print_mem_encrypt_feature_info(); } +int arch_has_restricted_virtio_memory_access(void) +{ + return sev_active(); +} +EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access); diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 7a84fc8bc5c3..17d292b7072f 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -27,7 +27,7 @@ SYM_FUNC_START(sme_encrypt_execute) * - stack page (PAGE_SIZE) * - encryption routine page (PAGE_SIZE) * - intermediate copy buffer (PMD_PAGE_SIZE) - * R8 - physcial address of the pagetables to use for encryption + * R8 - physical address of the pagetables to use for encryption */ push %rbp diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 6c5eb6f3f14f..a19374d26101 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -503,14 +503,10 @@ void __init sme_enable(struct boot_params *bp) #define AMD_SME_BIT BIT(0) #define AMD_SEV_BIT BIT(1) - /* - * Set the feature mask (SME or SEV) based on whether we are - * running under a hypervisor. - */ - eax = 1; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT; + + /* Check the SEV MSR whether SEV or SME is enabled */ + sev_status = __rdmsr(MSR_AMD64_SEV); + feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; /* * Check for the SME/SEV feature: @@ -530,19 +526,26 @@ void __init sme_enable(struct boot_params *bp) /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { + /* + * No SME if Hypervisor bit is set. This check is here to + * prevent a guest from trying to enable SME. For running as a + * KVM guest the MSR_K8_SYSCFG will be sufficient, but there + * might be other hypervisors which emulate that MSR as non-zero + * or even pass it through to the guest. + * A malicious hypervisor can still trick a guest into this + * path, but there is no way to protect against that. + */ + eax = 1; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (ecx & BIT(31)) + return; + /* For SME, check the SYSCFG MSR */ msr = __rdmsr(MSR_K8_SYSCFG); if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) return; } else { - /* For SEV, check the SEV MSR */ - msr = __rdmsr(MSR_AMD64_SEV); - if (!(msr & MSR_AMD64_SEV_ENABLED)) - return; - - /* Save SEV_STATUS to avoid reading MSR again */ - sev_status = msr; - /* SEV state cannot be controlled by a command line option */ sme_me_mask = me_mask; sev_enabled = true; diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index ca311aaa67b8..3112ca7786ed 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -695,7 +695,7 @@ int memtype_free(u64 start, u64 end) /** - * lookup_memtype - Looksup the memory type for a physical address + * lookup_memtype - Looks up the memory type for a physical address * @paddr: physical address of which memory type needs to be looked up * * Only to be called when PAT is enabled @@ -800,6 +800,7 @@ void memtype_free_io(resource_size_t start, resource_size_t end) memtype_free(start, end); } +#ifdef CONFIG_X86_PAT int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size) { enum page_cache_mode type = _PAGE_CACHE_MODE_WC; @@ -813,6 +814,7 @@ void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size) memtype_free_io(start, start + size); } EXPORT_SYMBOL(arch_io_free_memtype_wc); +#endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 16f878c26667..427980617557 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -680,7 +680,7 @@ pmd_t *lookup_pmd_address(unsigned long address) * end up in this kind of memory, for instance. * * This could be optimized, but it is only intended to be - * used at inititalization time, and keeping it + * used at initialization time, and keeping it * unoptimized should increase the testing coverage for * the more obscure platforms. */ diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 8873ed1438a9..a2332eef66e9 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -128,7 +128,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | /* * Called from the FPU code when creating a fresh set of FPU * registers. This is called from a very specific context where - * we know the FPU regstiers are safe for use and we can use PKRU + * we know the FPU registers are safe for use and we can use PKRU * directly. */ void copy_init_pkru_to_fpregs(void) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 1aab92930569..5d5c7bb50ce9 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -361,7 +361,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end, * global, so set it as global in both copies. Note: * the X86_FEATURE_PGE check is not _required_ because * the CPU ignores _PAGE_GLOBAL when PGE is not - * supported. The check keeps consistentency with + * supported. The check keeps consistency with * code that only set this bit when supported. */ if (boot_cpu_has(X86_FEATURE_PGE)) @@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void) for_each_possible_cpu(cpu) { /* - * The SYSCALL64 entry code needs to be able to find the - * thread stack and needs one word of scratch space in which - * to spill a register. All of this lives in the TSS, in - * the sp1 and sp2 slots. + * The SYSCALL64 entry code needs one word of scratch space + * in which to spill a register. It lives in the sp2 slot + * of the CPU's TSS. * * This is done for all possible CPUs during boot to ensure * that it's propagated to all mms. @@ -512,7 +511,7 @@ static void pti_clone_entry_text(void) static inline bool pti_kernel_image_global_ok(void) { /* - * Systems with PCIDs get litlle benefit from global + * Systems with PCIDs get little benefit from global * kernel text and are not worth the downsides. */ if (cpu_feature_enabled(X86_FEATURE_PCID)) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 569ac1d57f55..98f269560d40 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -106,7 +106,7 @@ static inline u16 kern_pcid(u16 asid) #ifdef CONFIG_PAGE_TABLE_ISOLATION /* - * Make sure that the dynamic ASID space does not confict with the + * Make sure that the dynamic ASID space does not conflict with the * bit we are using to switch between user and kernel ASIDs. */ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); @@ -736,7 +736,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * 3, we'd be break the invariant: we'd update local_tlb_gen above * 1 without the full flush that's needed for tlb_gen 2. * - * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. * Partial TLB flushes are not all that much cheaper than full TLB * flushes, so it seems unlikely that it would be a performance win * to do a partial flush if that won't bring our TLB fully up to @@ -876,7 +876,7 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, static inline void put_flush_tlb_info(void) { #ifdef CONFIG_DEBUG_VM - /* Complete reentrency prevention checks */ + /* Complete reentrancy prevention checks */ barrier(); this_cpu_dec(flush_tlb_info_idx); #endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6926d0ca6c71..7b9e3ff27c1a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -282,7 +282,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ - memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt); + memcpy(prog, x86_nops[5], cnt); prog += cnt; if (!ebpf_from_cbpf) { if (tail_call_reachable && !is_subprog) @@ -330,7 +330,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr, const bool text_live) { - const u8 *nop_insn = ideal_nops[NOP_ATOMIC5]; + const u8 *nop_insn = x86_nops[5]; u8 old_insn[X86_PATCH_SIZE]; u8 new_insn[X86_PATCH_SIZE]; u8 *prog; @@ -560,7 +560,7 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, if (stack_depth) EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8)); - memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + memcpy(prog, x86_nops[5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; /* out: */ @@ -881,7 +881,7 @@ static int emit_nops(u8 **pprog, int len) noplen = ASM_NOP_MAX; for (i = 0; i < noplen; i++) - EMIT1(ideal_nops[noplen][i]); + EMIT1(x86_nops[noplen][i]); len -= noplen; } @@ -1556,7 +1556,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ if (is_imm8(jmp_offset)) { if (jmp_padding) { /* To keep the jmp_offset valid, the extra bytes are - * padded before the jump insn, so we substract the + * padded before the jump insn, so we subtract the * 2 bytes of jmp_cond insn from INSN_SZ_DIFF. * * If the previous pass already emits an imm8 @@ -1631,7 +1631,7 @@ emit_jmp: if (jmp_padding) { /* To avoid breaking jmp_offset, the extra bytes * are padded before the actual jmp insn, so - * 2 bytes is substracted from INSN_SZ_DIFF. + * 2 bytes is subtracted from INSN_SZ_DIFF. * * If the previous pass already emits an imm8 * jmp, there is nothing to pad (0 byte). @@ -1689,7 +1689,16 @@ emit_jmp: } if (image) { - if (unlikely(proglen + ilen > oldproglen)) { + /* + * When populating the image, assert that: + * + * i) We do not write beyond the allocated space, and + * ii) addrs[i] did not change from the prior run, in order + * to validate assumptions made for computing branch + * displacements. + */ + if (unlikely(proglen + ilen > oldproglen || + proglen + ilen != addrs[i])) { pr_err("bpf_jit: fatal error\n"); return -EFAULT; } @@ -1936,7 +1945,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1975,6 +1984,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1993,8 +2011,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2003,6 +2020,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + memcpy(prog, x86_nops[5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { @@ -2033,9 +2053,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. */ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ @@ -2225,7 +2253,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2317,7 +2345,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index d17b67c69f89..6a99def7d315 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2276,7 +2276,16 @@ notyet: } if (image) { - if (unlikely(proglen + ilen > oldproglen)) { + /* + * When populating the image, assert that: + * + * i) We do not write beyond the allocated space, and + * ii) addrs[i] did not change from the prior run, in order + * to validate assumptions made for computing branch + * displacements. + */ + if (unlikely(proglen + ilen > oldproglen || + proglen + ilen != addrs[i])) { pr_err("bpf_jit: fatal error\n"); return -EFAULT; } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 0a0e168be1cb..02dc64625e64 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -375,7 +375,7 @@ static const struct dmi_system_id msi_k8t_dmi_table[] = { * The BIOS only gives options "DISABLED" and "AUTO". This code sets * the corresponding register-value to enable the soundcard. * - * The soundcard is only enabled, if the mainborad is identified + * The soundcard is only enabled, if the mainboard is identified * via DMI-tables and the soundcard is detected to be off. */ static void pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 1b82d77019b1..df7b5477fc4f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -195,7 +195,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) } /* - * Certain firmware versions are way too sentimential and still believe + * Certain firmware versions are way too sentimental and still believe * they are exclusive and unquestionable owners of the first physical page, * even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY * (but then write-access it later during SetVirtualAddressMap()). @@ -457,7 +457,7 @@ void __init efi_dump_pagetable(void) * in a kernel thread and user context. Preemption needs to remain disabled * while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm * can not change under us. - * It should be ensured that there are no concurent calls to this function. + * It should be ensured that there are no concurrent calls to this function. */ void efi_enter_mm(void) { diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 67d93a243c35..7850111008a8 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -441,7 +441,7 @@ void __init efi_free_boot_services(void) * 1.4.4 with SGX enabled booting Linux via Fedora 24's * grub2-efi on a hard disk. (And no, I don't know why * this happened, but Linux should still try to boot rather - * panicing early.) + * panicking early.) */ rm_size = real_mode_size_needed(); if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) { @@ -726,7 +726,7 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr) * Buggy efi_reset_system() is handled differently from other EFI * Runtime Services as it doesn't use efi_rts_wq. Although, * native_machine_emergency_restart() says that machine_real_restart() - * could fail, it's better not to compilcate this fault handler + * could fail, it's better not to complicate this fault handler * because this case occurs *very* rarely and hence could be improved * on a need by basis. */ diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 0286fe1b14b5..d3d456925b2a 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * imr.c -- Intel Isolated Memory Region driver * * Copyright(c) 2013 Intel Corporation. @@ -551,7 +551,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) /* * Setup an unlocked IMR around the physical extent of the kernel - * from the beginning of the .text secton to the end of the + * from the beginning of the .text section to the end of the * .rodata section as one physically contiguous block. * * We don't round up @size since it is already PAGE_SIZE aligned. diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 570e3062faac..761f3689f60a 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * imr_selftest.c -- Intel Isolated Memory Region self-test driver * * Copyright(c) 2013 Intel Corporation. diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c index 526f70f27c1c..fdd49d70b437 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -187,7 +187,7 @@ bool iosf_mbi_available(void) EXPORT_SYMBOL(iosf_mbi_available); /* - **************** P-Unit/kernel shared I2C bus arbritration **************** + **************** P-Unit/kernel shared I2C bus arbitration **************** * * Some Bay Trail and Cherry Trail devices have the P-Unit and us (the kernel) * share a single I2C bus to the PMIC. Below are helpers to arbitrate the @@ -493,7 +493,7 @@ static void iosf_sideband_debug_init(void) /* mcrx */ debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx); - /* mcr - initiates mailbox tranaction */ + /* mcr - initiates mailbox transaction */ debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops); } diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 85f4638764d6..994a229cb79f 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -27,7 +27,7 @@ static bool lid_wake_on_close; * wake-on-close. This is implemented as standard by the XO-1.5 DSDT. * * We provide here a sysfs attribute that will additionally enable - * wake-on-close behavior. This is useful (e.g.) when we oportunistically + * wake-on-close behavior. This is useful (e.g.) when we opportunistically * suspend with the display running; if the lid is then closed, we want to * wake up to turn the display off. * diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 26d1f6693789..75e3319e8bee 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -131,7 +131,7 @@ void * __init prom_early_alloc(unsigned long size) const size_t chunk_size = max(PAGE_SIZE, size); /* - * To mimimize the number of allocations, grab at least + * To minimize the number of allocations, grab at least * PAGE_SIZE of memory (that's an arbitrary choice that's * fast enough on the platforms we care about while minimizing * wasted bootmem) and hand off chunks of it to callers. diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index d2ccadc247e6..72c1e42d121d 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -30,10 +30,10 @@ * the boot start info structure. * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared. * - `cr4`: all bits are cleared. - * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’ - * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified. + * - `cs `: must be a 32-bit read/execute code segment with a base of `0` + * and a limit of `0xFFFFFFFF`. The selector value is unspecified. * - `ds`, `es`: must be a 32-bit read/write data segment with a base of - * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all + * `0` and a limit of `0xFFFFFFFF`. The selector values are all * unspecified. * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit * of '0x67'. @@ -46,10 +46,8 @@ #define PVH_GDT_ENTRY_CS 1 #define PVH_GDT_ENTRY_DS 2 -#define PVH_GDT_ENTRY_CANARY 3 #define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8) #define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8) -#define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8) SYM_CODE_START_LOCAL(pvh_start_xen) cld @@ -111,17 +109,6 @@ SYM_CODE_START_LOCAL(pvh_start_xen) #else /* CONFIG_X86_64 */ - /* Set base address in stack canary descriptor. */ - movl $_pa(gdt_start),%eax - movl $_pa(canary),%ecx - movw %cx, (PVH_GDT_ENTRY_CANARY * 8) + 2(%eax) - shrl $16, %ecx - movb %cl, (PVH_GDT_ENTRY_CANARY * 8) + 4(%eax) - movb %ch, (PVH_GDT_ENTRY_CANARY * 8) + 7(%eax) - - mov $PVH_CANARY_SEL,%eax - mov %eax,%gs - call mk_early_pgtbl_32 mov $_pa(initial_page_table), %eax @@ -165,7 +152,6 @@ SYM_DATA_START_LOCAL(gdt_start) .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */ #endif .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */ - .quad GDT_ENTRY(0x4090, 0, 0x18) /* PVH_CANARY_SEL */ SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end) .balign 16 diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index eafc530c8767..1e9ff28bc2e0 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -24,6 +24,7 @@ #include <asm/kdebug.h> #include <asm/local64.h> #include <asm/nmi.h> +#include <asm/reboot.h> #include <asm/traps.h> #include <asm/uv/uv.h> #include <asm/uv/uv_hub.h> @@ -91,6 +92,8 @@ static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1); static atomic_t uv_nmi_slave_continue; static cpumask_var_t uv_nmi_cpu_mask; +static atomic_t uv_nmi_kexec_failed; + /* Values for uv_nmi_slave_continue */ #define SLAVE_CLEAR 0 #define SLAVE_CONTINUE 1 @@ -834,38 +837,35 @@ static void uv_nmi_touch_watchdogs(void) touch_nmi_watchdog(); } -static atomic_t uv_nmi_kexec_failed; - -#if defined(CONFIG_KEXEC_CORE) -static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) +static void uv_nmi_kdump(int cpu, int main, struct pt_regs *regs) { + /* Check if kdump kernel loaded for both main and secondary CPUs */ + if (!kexec_crash_image) { + if (main) + pr_err("UV: NMI error: kdump kernel not loaded\n"); + return; + } + /* Call crash to dump system state */ - if (master) { + if (main) { pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu); crash_kexec(regs); - pr_emerg("UV: crash_kexec unexpectedly returned, "); + pr_emerg("UV: crash_kexec unexpectedly returned\n"); atomic_set(&uv_nmi_kexec_failed, 1); - if (!kexec_crash_image) { - pr_cont("crash kernel not loaded\n"); - return; - } - pr_cont("kexec busy, stalling cpus while waiting\n"); - } - /* If crash exec fails the slaves should return, otherwise stall */ - while (atomic_read(&uv_nmi_kexec_failed) == 0) - mdelay(10); -} + } else { /* secondary */ -#else /* !CONFIG_KEXEC_CORE */ -static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) -{ - if (master) - pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); - atomic_set(&uv_nmi_kexec_failed, 1); + /* If kdump kernel fails, secondaries will exit this loop */ + while (atomic_read(&uv_nmi_kexec_failed) == 0) { + + /* Once shootdown cpus starts, they do not return */ + run_crash_ipi_callback(regs); + + mdelay(10); + } + } } -#endif /* !CONFIG_KEXEC_CORE */ #ifdef CONFIG_KGDB #ifdef CONFIG_KGDB_KDB @@ -889,7 +889,7 @@ static inline int uv_nmi_kdb_reason(void) * Call KGDB/KDB from NMI handler * * Note that if both KGDB and KDB are configured, then the action of 'kgdb' or - * 'kdb' has no affect on which is used. See the KGDB documention for further + * 'kdb' has no affect on which is used. See the KGDB documentation for further * information. */ static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index db1378c6ff26..3a070e7cdb8b 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -99,11 +99,8 @@ static void __save_processor_state(struct saved_context *ctxt) /* * segment registers */ -#ifdef CONFIG_X86_32_LAZY_GS savesegment(gs, ctxt->gs); -#endif #ifdef CONFIG_X86_64 - savesegment(gs, ctxt->gs); savesegment(fs, ctxt->fs); savesegment(ds, ctxt->ds); savesegment(es, ctxt->es); @@ -232,7 +229,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base); #else loadsegment(fs, __KERNEL_PERCPU); - loadsegment(gs, __KERNEL_STACK_CANARY); #endif /* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */ @@ -255,7 +251,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) */ wrmsrl(MSR_FS_BASE, ctxt->fs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base); -#elif defined(CONFIG_X86_32_LAZY_GS) +#else loadsegment(gs, ctxt->gs); #endif @@ -321,7 +317,7 @@ int hibernate_resume_nonboot_cpu_disable(void) /* * When bsp_check() is called in hibernate and suspend, cpu hotplug - * is disabled already. So it's unnessary to handle race condition between + * is disabled already. So it's unnecessary to handle race condition between * cpumask query and cpu hotplug. */ static int bsp_check(void) diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index cd3914fc9f3d..e94e0050a583 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -13,8 +13,8 @@ #include <linux/kdebug.h> #include <linux/cpu.h> #include <linux/pgtable.h> - -#include <crypto/hash.h> +#include <linux/types.h> +#include <linux/crc32.h> #include <asm/e820/api.h> #include <asm/init.h> @@ -54,95 +54,33 @@ int pfn_is_nosave(unsigned long pfn) return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn; } - -#define MD5_DIGEST_SIZE 16 - struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; - u8 e820_digest[MD5_DIGEST_SIZE]; + unsigned long e820_checksum; }; -#if IS_BUILTIN(CONFIG_CRYPTO_MD5) /** - * get_e820_md5 - calculate md5 according to given e820 table + * compute_e820_crc32 - calculate crc32 of a given e820 table * * @table: the e820 table to be calculated - * @buf: the md5 result to be stored to + * + * Return: the resulting checksum */ -static int get_e820_md5(struct e820_table *table, void *buf) +static inline u32 compute_e820_crc32(struct e820_table *table) { - struct crypto_shash *tfm; - struct shash_desc *desc; - int size; - int ret = 0; - - tfm = crypto_alloc_shash("md5", 0, 0); - if (IS_ERR(tfm)) - return -ENOMEM; - - desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), - GFP_KERNEL); - if (!desc) { - ret = -ENOMEM; - goto free_tfm; - } - - desc->tfm = tfm; - - size = offsetof(struct e820_table, entries) + + int size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries; - if (crypto_shash_digest(desc, (u8 *)table, size, buf)) - ret = -EINVAL; - - kfree_sensitive(desc); - -free_tfm: - crypto_free_shash(tfm); - return ret; -} - -static int hibernation_e820_save(void *buf) -{ - return get_e820_md5(e820_table_firmware, buf); -} - -static bool hibernation_e820_mismatch(void *buf) -{ - int ret; - u8 result[MD5_DIGEST_SIZE]; - - memset(result, 0, MD5_DIGEST_SIZE); - /* If there is no digest in suspend kernel, let it go. */ - if (!memcmp(result, buf, MD5_DIGEST_SIZE)) - return false; - - ret = get_e820_md5(e820_table_firmware, result); - if (ret) - return true; - - return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; -} -#else -static int hibernation_e820_save(void *buf) -{ - return 0; -} - -static bool hibernation_e820_mismatch(void *buf) -{ - /* If md5 is not builtin for restore kernel, let it go. */ - return false; + return ~crc32_le(~0, (unsigned char const *)table, size); } -#endif #ifdef CONFIG_X86_64 -#define RESTORE_MAGIC 0x23456789ABCDEF01UL +#define RESTORE_MAGIC 0x23456789ABCDEF02UL #else -#define RESTORE_MAGIC 0x12345678UL +#define RESTORE_MAGIC 0x12345679UL #endif /** @@ -179,7 +117,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) */ rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK; - return hibernation_e820_save(rdr->e820_digest); + rdr->e820_checksum = compute_e820_crc32(e820_table_firmware); + return 0; } /** @@ -200,7 +139,7 @@ int arch_hibernation_header_restore(void *addr) jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; - if (hibernation_e820_mismatch(rdr->e820_digest)) { + if (rdr->e820_checksum != compute_e820_crc32(e820_table_firmware)) { pr_crit("Hibernate inconsistent memory map detected!\n"); return -ENODEV; } diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 22fda7d99159..1be71ef5e4c4 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -103,7 +103,7 @@ static void __init setup_real_mode(void) *ptr += phys_base; } - /* Must be perfomed *after* relocation. */ + /* Must be performed *after* relocation. */ trampoline_header = (struct trampoline_header *) __va(real_mode_header->trampoline_header); diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c index 34eda63c124b..472540aeabc2 100644 --- a/arch/x86/tools/insn_decoder_test.c +++ b/arch/x86/tools/insn_decoder_test.c @@ -120,7 +120,7 @@ int main(int argc, char **argv) while (fgets(line, BUFSIZE, stdin)) { char copy[BUFSIZE], *s, *tab1, *tab2; - int nb = 0; + int nb = 0, ret; unsigned int b; if (line[0] == '<') { @@ -148,10 +148,12 @@ int main(int argc, char **argv) } else break; } + /* Decode an instruction */ - insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64); - insn_get_length(&insn); - if (insn.length != nb) { + ret = insn_decode(&insn, insn_buff, sizeof(insn_buff), + x86_64 ? INSN_MODE_64 : INSN_MODE_32); + + if (ret < 0 || insn.length != nb) { warnings++; pr_warn("Found an x86 instruction decoder bug, " "please report this.\n", sym); diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index c6a0000ae635..213f35f94feb 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -218,8 +218,8 @@ static void parse_args(int argc, char **argv) int main(int argc, char **argv) { + int insns = 0, ret; struct insn insn; - int insns = 0; int errors = 0; unsigned long i; unsigned char insn_buff[MAX_INSN_SIZE * 2]; @@ -237,15 +237,15 @@ int main(int argc, char **argv) continue; /* Decode an instruction */ - insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64); - insn_get_length(&insn); + ret = insn_decode(&insn, insn_buff, sizeof(insn_buff), + x86_64 ? INSN_MODE_64 : INSN_MODE_32); if (insn.next_byte <= insn.kaddr || insn.kaddr + MAX_INSN_SIZE < insn.next_byte) { /* Access out-of-range memory */ dump_stream(stderr, "Error: Found an access violation", i, insn_buff, &insn); errors++; - } else if (verbose && !insn_complete(&insn)) + } else if (verbose && ret < 0) dump_stream(stdout, "Info: Found an undecodable input", i, insn_buff, &insn); else if (verbose >= 2) dump_insn(stdout, &insn); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index dc0a337f985b..17503fed2017 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1070,8 +1070,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_pmc = xen_read_pmc, - .iret = xen_iret, - .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, .load_gdt = xen_load_gdt, @@ -1204,7 +1202,6 @@ static void __init xen_setup_gdt(int cpu) pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; pv_ops.cpu.load_gdt = xen_load_gdt_boot; - setup_stack_canary_segment(cpu); switch_to_new_gdt(cpu); pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; @@ -1233,8 +1230,8 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; - pv_ops.init.patch = paravirt_patch_default; pv_ops.cpu = xen_cpu_ops; + paravirt_iret = xen_iret; xen_init_irq_ops(); /* diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index cf2ade864c30..1e28c880f642 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2410,7 +2410,7 @@ int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, rmd.prot = prot; /* * We use the err_ptr to indicate if there we are doing a contiguous - * mapping or a discontigious mapping. + * mapping or a discontiguous mapping. */ rmd.contiguous = !err_ptr; rmd.no_translate = no_translate; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 17d80f751fcb..ac06ca32e9ef 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -98,8 +98,8 @@ EXPORT_SYMBOL_GPL(xen_p2m_size); unsigned long xen_max_p2m_pfn __read_mostly; EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT -#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT +#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT +#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT #else #define P2M_LIMIT 0 #endif @@ -416,9 +416,6 @@ void __init xen_vmalloc_p2m_tree(void) xen_p2m_last_pfn = xen_max_p2m_pfn; p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE; - if (!p2m_limit && IS_ENABLED(CONFIG_XEN_UNPOPULATED_ALLOC)) - p2m_limit = xen_start_info->nr_pages * XEN_EXTRA_MEM_RATIO; - vm.flags = VM_ALLOC; vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit), PMD_SIZE * PMDS_PER_MID_PAGE); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1a3b75652fa4..8bfc10330107 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -59,6 +59,18 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); static void __init xen_parse_512gb(void) @@ -778,13 +790,13 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Clamp the amount of extra memory to a XEN_EXTRA_MEM_RATIO + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. * * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. */ - extra_pages = min3(XEN_EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages, max_pages - max_pfn); i = 0; addr = xen_e820_table.entries[0].addr; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 91f5b330dcc6..d9c945ee1100 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -379,11 +379,6 @@ void xen_timer_resume(void) } } -static const struct pv_time_ops xen_time_ops __initconst = { - .sched_clock = xen_sched_clock, - .steal_clock = xen_steal_clock, -}; - static struct pvclock_vsyscall_time_info *xen_clock __read_mostly; static u64 xen_clock_value_saved; @@ -525,17 +520,24 @@ static void __init xen_time_init(void) pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier); } -void __init xen_init_time_ops(void) +static void __init xen_init_time_common(void) { xen_sched_clock_offset = xen_clocksource_read(); - pv_ops.time = xen_time_ops; + static_call_update(pv_steal_clock, xen_steal_clock); + paravirt_set_sched_clock(xen_sched_clock); + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; +} + +void __init xen_init_time_ops(void) +{ + xen_init_time_common(); x86_init.timers.timer_init = xen_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; /* Dom0 uses the native method to set the hardware RTC. */ if (!xen_initial_domain()) x86_platform.set_wallclock = xen_set_wallclock; @@ -569,13 +571,11 @@ void __init xen_hvm_init_time_ops(void) return; } - xen_sched_clock_offset = xen_clocksource_read(); - pv_ops.time = xen_time_ops; + xen_init_time_common(); + x86_init.timers.setup_percpu_clockev = xen_time_init; x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; x86_platform.set_wallclock = xen_set_wallclock; } #endif |