Diffstat (limited to 'arch/x86')
253 files changed, 4651 insertions, 8046 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d4bbebe1d72f..887bf86c447a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -68,6 +68,7 @@ config X86
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV if X86_64
+	select ARCH_HAS_MEM_ENCRYPT
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_PMEM_API if X86_64
 	select ARCH_HAS_PTE_DEVMAP if X86_64
@@ -147,6 +148,7 @@ config X86
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
 	select HAVE_ARCH_VMAP_STACK if X86_64
 	select HAVE_ARCH_WITHIN_STACK_FRAMES
+	select HAVE_ASM_MODVERSIONS
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_CONTEXT_TRACKING if X86_64
@@ -794,6 +796,7 @@ config KVM_GUEST
 	bool "KVM Guest support (including kvmclock)"
 	depends on PARAVIRT
 	select PARAVIRT_CLOCK
+	select ARCH_CPUIDLE_HALTPOLL
 	default y
 	---help---
 	  This option enables various optimizations for running under the KVM
@@ -802,6 +805,12 @@ config KVM_GUEST
 	  underlying device model, the host provides the guest with
 	  timing infrastructure such as time of day, and system time
 
+config ARCH_CPUIDLE_HALTPOLL
+	def_bool n
+	prompt "Disable host haltpoll when loading haltpoll driver"
+	help
+	  If virtualized under KVM, disable host haltpoll.
+
 config PVH
 	bool "Support for running PVH guests"
 	---help---
@@ -1519,9 +1528,6 @@ config X86_CPA_STATISTICS
 	  helps to determine the effectiveness of preserving large and huge
 	  page mappings when mapping protections are changed.
 
-config ARCH_HAS_MEM_ENCRYPT
-	def_bool y
-
 config AMD_MEM_ENCRYPT
 	bool "AMD Secure Memory Encryption (SME) support"
 	depends on X86_64 && CPU_SUP_AMD
@@ -1935,6 +1941,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
 	  If unsure, say y.
 
+choice
+	prompt "TSX enable mode"
+	depends on CPU_SUP_INTEL
+	default X86_INTEL_TSX_MODE_OFF
+	help
+	  Intel's TSX (Transactional Synchronization Extensions) feature
+	  allows optimizing locking protocols through lock elision, which
+	  can lead to a noticeable performance boost.
+
+	  On the other hand, it has been shown that TSX can be exploited
+	  to form side channel attacks (e.g. TAA), and chances are there
+	  will be more such attacks discovered in the future.
+
+	  Therefore TSX is not enabled by default (aka tsx=off). An admin
+	  may override this default with the tsx=on command line parameter.
+	  Even with TSX enabled, the kernel will attempt to enable the best
+	  possible TAA mitigation setting depending on the microcode available
+	  for the particular machine.
+
+	  This option allows setting the default tsx mode between tsx=on, =off
+	  and =auto. See Documentation/admin-guide/kernel-parameters.txt for
+	  more details.
+
+	  Say off if not sure, auto if TSX is in use but should only be enabled
+	  on safe platforms, or on if TSX is in use and its security impact is
+	  not a concern.
+
+config X86_INTEL_TSX_MODE_OFF
+	bool "off"
+	help
+	  TSX is disabled if possible - equals the tsx=off command line
+	  parameter.
+
+config X86_INTEL_TSX_MODE_ON
+	bool "on"
+	help
+	  TSX is always enabled on TSX capable HW - equals the tsx=on command
+	  line parameter.
+
+config X86_INTEL_TSX_MODE_AUTO
+	bool "auto"
+	help
+	  TSX is enabled on TSX capable HW that is believed to be safe against
+	  side channel attacks - equals the tsx=auto command line parameter.
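[The three choice entries map one-to-one onto the tsx= boot parameter documented in Documentation/admin-guide/kernel-parameters.txt, so whichever build-time default is selected here can still be overridden at boot. A hypothetical boot entry forcing auto mode (the image path and other flags are illustrative only, not part of this patch) would pass:

	linux /boot/vmlinuz root=/dev/sda1 ro tsx=auto
]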
+endchoice + config EFI bool "EFI runtime service support" depends on ACPI @@ -2026,20 +2077,30 @@ config KEXEC_FILE config ARCH_HAS_KEXEC_PURGATORY def_bool KEXEC_FILE -config KEXEC_VERIFY_SIG +config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE ---help--- - This option makes kernel signature verification mandatory for - the kexec_file_load() syscall. - In addition to that option, you need to enable signature + This option makes the kexec_file_load() syscall check for a valid + signature of the kernel image. The image can still be loaded without + a valid signature unless you also enable KEXEC_SIG_FORCE, though if + there's a signature that we can check, then it must be valid. + + In addition to this option, you need to enable signature verification for the corresponding kernel image type being loaded in order for this to work. +config KEXEC_SIG_FORCE + bool "Require a valid signature in kexec_file_load() syscall" + depends on KEXEC_SIG + ---help--- + This option makes kernel signature verification mandatory for + the kexec_file_load() syscall. + config KEXEC_BZIMAGE_VERIFY_SIG bool "Enable bzImage signature verification support" - depends on KEXEC_VERIFY_SIG + depends on KEXEC_SIG depends on SIGNED_PE_FILE_VERIFICATION select SYSTEM_TRUSTED_KEYRING ---help--- diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 71c92db47c41..bf9cd83de777 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -171,7 +171,7 @@ config HAVE_MMIOTRACE_SUPPORT config X86_DECODER_SELFTEST bool "x86 instruction decoder selftest" - depends on DEBUG_KERNEL && KPROBES + depends on DEBUG_KERNEL && INSTRUCTION_DECODER depends on !COMPILE_TEST ---help--- Perform x86 instruction decoder selftests at build time. diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 19eca14b49a0..ca866f1cca2e 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -28,8 +28,6 @@ #include "cpuflags.h" /* Useful macros */ -#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) - #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) extern struct setup_header hdr; diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 15255f388a85..25019d42ae93 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -21,30 +21,6 @@ struct mem_vector immovable_mem[MAX_NUMNODES*2]; /* - * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex - * digits, and '\0' for termination. - */ -#define MAX_ADDR_LEN 19 - -static acpi_physical_address get_acpi_rsdp(void) -{ - acpi_physical_address addr = 0; - -#ifdef CONFIG_KEXEC - char val[MAX_ADDR_LEN] = { }; - int ret; - - ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN); - if (ret < 0) - return 0; - - if (kstrtoull(val, 16, &addr)) - return 0; -#endif - return addr; -} - -/* * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and * ACPI_TABLE_GUID are found, take the former, which has more features. */ @@ -278,10 +254,7 @@ acpi_physical_address get_rsdp_addr(void) { acpi_physical_address pa; - pa = get_acpi_rsdp(); - - if (!pa) - pa = boot_params->acpi_rsdp_addr; + pa = boot_params->acpi_rsdp_addr; /* * Try to get EFI data from setup_data. This can happen when we're a @@ -301,6 +274,30 @@ acpi_physical_address get_rsdp_addr(void) } #if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) +/* + * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex + * digits, and '\0' for termination. 
+ */ +#define MAX_ADDR_LEN 19 + +static acpi_physical_address get_cmdline_acpi_rsdp(void) +{ + acpi_physical_address addr = 0; + +#ifdef CONFIG_KEXEC + char val[MAX_ADDR_LEN] = { }; + int ret; + + ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN); + if (ret < 0) + return 0; + + if (kstrtoull(val, 16, &addr)) + return 0; +#endif + return addr; +} + /* Compute SRAT address from RSDP. */ static unsigned long get_acpi_srat_table(void) { @@ -311,7 +308,17 @@ static unsigned long get_acpi_srat_table(void) char arg[10]; u8 *entry; - rsdp = (struct acpi_table_rsdp *)(long)boot_params->acpi_rsdp_addr; + /* + * Check whether we were given an RSDP on the command line. We don't + * stash this in boot params because the kernel itself may have + * different ideas about whether to trust a command-line parameter. + */ + rsdp = (struct acpi_table_rsdp *)get_cmdline_acpi_rsdp(); + + if (!rsdp) + rsdp = (struct acpi_table_rsdp *)(long) + boot_params->acpi_rsdp_addr; + if (!rsdp) return 0; diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index d6662fdef300..82bc60c8acb2 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -13,6 +13,7 @@ #include <asm/e820/types.h> #include <asm/setup.h> #include <asm/desc.h> +#include <asm/boot.h> #include "../string.h" #include "eboot.h" @@ -813,7 +814,8 @@ efi_main(struct efi_config *c, struct boot_params *boot_params) status = efi_relocate_kernel(sys_table, &bzimage_addr, hdr->init_size, hdr->init_size, hdr->pref_address, - hdr->kernel_alignment); + hdr->kernel_alignment, + LOAD_PHYSICAL_ADDR); if (status != EFI_SUCCESS) { efi_printk(sys_table, "efi_relocate_kernel() failed!\n"); goto fail; diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 37380c0d5999..5e30eaaf8576 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -140,7 +140,7 @@ ENTRY(startup_32) /* * Jump to the relocated address. */ - leal relocated(%ebx), %eax + leal .Lrelocated(%ebx), %eax jmp *%eax ENDPROC(startup_32) @@ -209,7 +209,7 @@ ENDPROC(efi32_stub_entry) #endif .text -relocated: +.Lrelocated: /* * Clear BSS (stack is currently empty) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 6233ae35d0d9..d98cd483377e 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -87,7 +87,7 @@ ENTRY(startup_32) call verify_cpu testl %eax, %eax - jnz no_longmode + jnz .Lno_longmode /* * Compute the delta between where we were compiled to run at @@ -322,7 +322,7 @@ ENTRY(startup_64) 1: popq %rdi subq $1b, %rdi - call adjust_got + call .Ladjust_got /* * At this point we are in long mode with 4-level paging enabled, @@ -421,7 +421,7 @@ trampoline_return: /* The new adjustment is the relocation address */ movq %rbx, %rdi - call adjust_got + call .Ladjust_got /* * Copy the compressed kernel to the end of our buffer @@ -440,7 +440,7 @@ trampoline_return: /* * Jump to the relocated address. */ - leaq relocated(%rbx), %rax + leaq .Lrelocated(%rbx), %rax jmp *%rax #ifdef CONFIG_EFI_STUB @@ -511,7 +511,7 @@ ENDPROC(efi64_stub_entry) #endif .text -relocated: +.Lrelocated: /* * Clear BSS (stack is currently empty) @@ -548,7 +548,7 @@ relocated: * first time we touch GOT). * RDI is the new adjustment to apply. 
*/ -adjust_got: +.Ladjust_got: /* Walk through the GOT adding the address to the entries */ leaq _got(%rip), %rdx leaq _egot(%rip), %rcx @@ -622,7 +622,7 @@ ENTRY(trampoline_32bit_src) movl %eax, %cr4 /* Calculate address of paging_enabled() once we are executing in the trampoline */ - leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax + leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax /* Prepare the stack for far return to Long Mode */ pushl $__KERNEL_CS @@ -635,7 +635,7 @@ ENTRY(trampoline_32bit_src) lret .code64 -paging_enabled: +.Lpaging_enabled: /* Return from the trampoline */ jmp *%rdi @@ -647,7 +647,7 @@ paging_enabled: .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE .code32 -no_longmode: +.Lno_longmode: /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 53ac0cb2396d..9652d5c2afda 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -345,6 +345,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, { const unsigned long kernel_total_size = VO__end - VO__text; unsigned long virt_addr = LOAD_PHYSICAL_ADDR; + unsigned long needed_size; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -379,26 +380,38 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + /* + * The memory hole needed for the kernel is the larger of either + * the entire decompressed kernel plus relocation table, or the + * entire decompressed kernel plus .bss and .brk sections. + * + * On X86_64, the memory is mapped with PMD pages. Round the + * size up so that the full extent of PMD pages mapped is + * included in the check against the valid memory table + * entries. This ensures the full mapped area is usable RAM + * and doesn't include any reserved areas. + */ + needed_size = max(output_len, kernel_total_size); +#ifdef CONFIG_X86_64 + needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN); +#endif + /* Report initial kernel position details. */ debug_putaddr(input_data); debug_putaddr(input_len); debug_putaddr(output); debug_putaddr(output_len); debug_putaddr(kernel_total_size); + debug_putaddr(needed_size); #ifdef CONFIG_X86_64 /* Report address of 32-bit trampoline */ debug_putaddr(trampoline_32bit); #endif - /* - * The memory hole needed for the kernel is the larger of either - * the entire decompressed kernel plus relocation table, or the - * entire decompressed kernel plus .bss and .brk sections. - */ choose_random_location((unsigned long)input_data, input_len, (unsigned long *)&output, - max(output_len, kernel_total_size), + needed_size, &virt_addr); /* Validate memory location choices. 
*/ diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 996df3d586f0..e3add857c2c9 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -10,6 +10,7 @@ /* * Main module for the real-mode kernel code */ +#include <linux/build_bug.h> #include "boot.h" #include "string.h" diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 45734e1cf967..759b1a927826 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -14,11 +14,9 @@ sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o -obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o -obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o @@ -38,14 +36,6 @@ obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o -obj-$(CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2) += aegis128l-aesni.o -obj-$(CONFIG_CRYPTO_AEGIS256_AESNI_SSE2) += aegis256-aesni.o - -obj-$(CONFIG_CRYPTO_MORUS640_GLUE) += morus640_glue.o -obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o - -obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o -obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o @@ -64,15 +54,11 @@ endif ifeq ($(avx2_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o - - obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o endif -aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o -aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o @@ -82,11 +68,6 @@ chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o -aegis128l-aesni-y := aegis128l-aesni-asm.o aegis128l-aesni-glue.o -aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o - -morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o -morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o @@ -106,8 +87,6 @@ ifeq ($(avx2_supported),yes) chacha-x86_64-y += chacha-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o - morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o - nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o endif diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S deleted file mode 100644 index 1461ef00c0e8..000000000000 --- a/arch/x86/crypto/aegis128l-aesni-asm.S +++ /dev/null @@ -1,823 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AES-NI + SSE2 implementation of AEGIS-128L - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. 
All rights reserved. - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define STATE0 %xmm0 -#define STATE1 %xmm1 -#define STATE2 %xmm2 -#define STATE3 %xmm3 -#define STATE4 %xmm4 -#define STATE5 %xmm5 -#define STATE6 %xmm6 -#define STATE7 %xmm7 -#define MSG0 %xmm8 -#define MSG1 %xmm9 -#define T0 %xmm10 -#define T1 %xmm11 -#define T2 %xmm12 -#define T3 %xmm13 - -#define STATEP %rdi -#define LEN %rsi -#define SRC %rdx -#define DST %rcx - -.section .rodata.cst16.aegis128l_const, "aM", @progbits, 32 -.align 16 -.Laegis128l_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Laegis128l_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.aegis128l_counter, "aM", @progbits, 16 -.align 16 -.Laegis128l_counter0: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f -.Laegis128l_counter1: - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - -.text - -/* - * __load_partial: internal ABI - * input: - * LEN - bytes - * SRC - src - * output: - * MSG0 - first message block - * MSG1 - second message block - * changed: - * T0 - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG0, MSG0 - pxor MSG1, MSG1 - - mov LEN, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov LEN, %r8 - and $0x1E, %r8 - add SRC, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov LEN, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov LEN, %r8 - and $0x1C, %r8 - add SRC, %r8 - shl $0x10, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov LEN, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov LEN, %r8 - and $0x18, %r8 - add SRC, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG0 - - mov LEN, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov LEN, %r8 - and $0x10, %r8 - add SRC, %r8 - pslldq $8, MSG0 - movq (%r8), T0 - pxor T0, MSG0 - -.Lld_partial_8: - mov LEN, %r8 - and $0x10, %r8 - jz .Lld_partial_16 - - movdqa MSG0, MSG1 - movdqu (SRC), MSG0 - -.Lld_partial_16: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * LEN - bytes - * DST - dst - * output: - * T0 - first message block - * T1 - second message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov LEN, %r8 - mov DST, %r9 - - cmp $16, %r8 - jl .Lst_partial_16 - - movdqu T0, (%r9) - movdqa T1, T0 - - sub $16, %r8 - add $16, %r9 - -.Lst_partial_16: - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $0x10, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -.macro update - movdqa STATE7, T0 - aesenc STATE0, STATE7 - aesenc STATE1, STATE0 - aesenc STATE2, STATE1 - aesenc STATE3, STATE2 - aesenc STATE4, STATE3 - aesenc STATE5, STATE4 - aesenc STATE6, STATE5 - aesenc T0, STATE6 -.endm - -.macro update0 - update - pxor MSG0, STATE7 - pxor MSG1, STATE3 -.endm - -.macro update1 - update - pxor MSG0, STATE6 - pxor MSG1, STATE2 -.endm - -.macro update2 - update - pxor MSG0, STATE5 - pxor MSG1, STATE1 -.endm - -.macro update3 - update 
- pxor MSG0, STATE4 - pxor MSG1, STATE0 -.endm - -.macro update4 - update - pxor MSG0, STATE3 - pxor MSG1, STATE7 -.endm - -.macro update5 - update - pxor MSG0, STATE2 - pxor MSG1, STATE6 -.endm - -.macro update6 - update - pxor MSG0, STATE1 - pxor MSG1, STATE5 -.endm - -.macro update7 - update - pxor MSG0, STATE0 - pxor MSG1, STATE4 -.endm - -.macro state_load - movdqu 0x00(STATEP), STATE0 - movdqu 0x10(STATEP), STATE1 - movdqu 0x20(STATEP), STATE2 - movdqu 0x30(STATEP), STATE3 - movdqu 0x40(STATEP), STATE4 - movdqu 0x50(STATEP), STATE5 - movdqu 0x60(STATEP), STATE6 - movdqu 0x70(STATEP), STATE7 -.endm - -.macro state_store s0 s1 s2 s3 s4 s5 s6 s7 - movdqu \s7, 0x00(STATEP) - movdqu \s0, 0x10(STATEP) - movdqu \s1, 0x20(STATEP) - movdqu \s2, 0x30(STATEP) - movdqu \s3, 0x40(STATEP) - movdqu \s4, 0x50(STATEP) - movdqu \s5, 0x60(STATEP) - movdqu \s6, 0x70(STATEP) -.endm - -.macro state_store0 - state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 -.endm - -.macro state_store1 - state_store STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 -.endm - -.macro state_store2 - state_store STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 -.endm - -.macro state_store3 - state_store STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 -.endm - -.macro state_store4 - state_store STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 -.endm - -.macro state_store5 - state_store STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 -.endm - -.macro state_store6 - state_store STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 -.endm - -.macro state_store7 - state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 -.endm - -/* - * void crypto_aegis128l_aesni_init(void *state, const void *key, const void *iv); - */ -ENTRY(crypto_aegis128l_aesni_init) - FRAME_BEGIN - - /* load key: */ - movdqa (%rsi), MSG1 - movdqa MSG1, STATE0 - movdqa MSG1, STATE4 - movdqa MSG1, STATE5 - movdqa MSG1, STATE6 - movdqa MSG1, STATE7 - - /* load IV: */ - movdqu (%rdx), MSG0 - pxor MSG0, STATE0 - pxor MSG0, STATE4 - - /* load the constants: */ - movdqa .Laegis128l_const_0, STATE2 - movdqa .Laegis128l_const_1, STATE1 - movdqa STATE1, STATE3 - pxor STATE2, STATE5 - pxor STATE1, STATE6 - pxor STATE2, STATE7 - - /* update 10 times with IV and KEY: */ - update0 - update1 - update2 - update3 - update4 - update5 - update6 - update7 - update0 - update1 - - state_store1 - - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_init) - -.macro ad_block a i - movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 - movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 - update\i - sub $0x20, LEN - cmp $0x20, LEN - jl .Lad_out_\i -.endm - -/* - * void crypto_aegis128l_aesni_ad(void *state, unsigned int length, - * const void *data); - */ -ENTRY(crypto_aegis128l_aesni_ad) - FRAME_BEGIN - - cmp $0x20, LEN - jb .Lad_out - - state_load - - mov SRC, %r8 - and $0xf, %r8 - jnz .Lad_u_loop - -.align 8 -.Lad_a_loop: - ad_block a 0 - ad_block a 1 - ad_block a 2 - ad_block a 3 - ad_block a 4 - ad_block a 5 - ad_block a 6 - ad_block a 7 - - add $0x100, SRC - jmp .Lad_a_loop - -.align 8 -.Lad_u_loop: - ad_block u 0 - ad_block u 1 - ad_block u 2 - ad_block u 3 - ad_block u 4 - ad_block u 5 - ad_block u 6 - ad_block u 7 - - add $0x100, SRC - jmp .Lad_u_loop - -.Lad_out_0: - state_store0 - FRAME_END - ret - -.Lad_out_1: - state_store1 - FRAME_END - ret - -.Lad_out_2: - state_store2 - FRAME_END - ret - -.Lad_out_3: - state_store3 - FRAME_END - ret - -.Lad_out_4: - state_store4 - FRAME_END - ret - -.Lad_out_5: - state_store5 - FRAME_END - ret - 
-.Lad_out_6: - state_store6 - FRAME_END - ret - -.Lad_out_7: - state_store7 - FRAME_END - ret - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_ad) - -.macro crypt m0 m1 s0 s1 s2 s3 s4 s5 s6 s7 - pxor \s1, \m0 - pxor \s6, \m0 - movdqa \s2, T3 - pand \s3, T3 - pxor T3, \m0 - - pxor \s2, \m1 - pxor \s5, \m1 - movdqa \s6, T3 - pand \s7, T3 - pxor T3, \m1 -.endm - -.macro crypt0 m0 m1 - crypt \m0 \m1 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 -.endm - -.macro crypt1 m0 m1 - crypt \m0 \m1 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 -.endm - -.macro crypt2 m0 m1 - crypt \m0 \m1 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 -.endm - -.macro crypt3 m0 m1 - crypt \m0 \m1 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 STATE4 -.endm - -.macro crypt4 m0 m1 - crypt \m0 \m1 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 STATE3 -.endm - -.macro crypt5 m0 m1 - crypt \m0 \m1 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 STATE2 -.endm - -.macro crypt6 m0 m1 - crypt \m0 \m1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 STATE1 -.endm - -.macro crypt7 m0 m1 - crypt \m0 \m1 STATE1 STATE2 STATE3 STATE4 STATE5 STATE6 STATE7 STATE0 -.endm - -.macro encrypt_block a i - movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 - movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 - movdqa MSG0, T0 - movdqa MSG1, T1 - crypt\i T0, T1 - movdq\a T0, (\i * 0x20 + 0x00)(DST) - movdq\a T1, (\i * 0x20 + 0x10)(DST) - - update\i - - sub $0x20, LEN - cmp $0x20, LEN - jl .Lenc_out_\i -.endm - -.macro decrypt_block a i - movdq\a (\i * 0x20 + 0x00)(SRC), MSG0 - movdq\a (\i * 0x20 + 0x10)(SRC), MSG1 - crypt\i MSG0, MSG1 - movdq\a MSG0, (\i * 0x20 + 0x00)(DST) - movdq\a MSG1, (\i * 0x20 + 0x10)(DST) - - update\i - - sub $0x20, LEN - cmp $0x20, LEN - jl .Ldec_out_\i -.endm - -/* - * void crypto_aegis128l_aesni_enc(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis128l_aesni_enc) - FRAME_BEGIN - - cmp $0x20, LEN - jb .Lenc_out - - state_load - - mov SRC, %r8 - or DST, %r8 - and $0xf, %r8 - jnz .Lenc_u_loop - -.align 8 -.Lenc_a_loop: - encrypt_block a 0 - encrypt_block a 1 - encrypt_block a 2 - encrypt_block a 3 - encrypt_block a 4 - encrypt_block a 5 - encrypt_block a 6 - encrypt_block a 7 - - add $0x100, SRC - add $0x100, DST - jmp .Lenc_a_loop - -.align 8 -.Lenc_u_loop: - encrypt_block u 0 - encrypt_block u 1 - encrypt_block u 2 - encrypt_block u 3 - encrypt_block u 4 - encrypt_block u 5 - encrypt_block u 6 - encrypt_block u 7 - - add $0x100, SRC - add $0x100, DST - jmp .Lenc_u_loop - -.Lenc_out_0: - state_store0 - FRAME_END - ret - -.Lenc_out_1: - state_store1 - FRAME_END - ret - -.Lenc_out_2: - state_store2 - FRAME_END - ret - -.Lenc_out_3: - state_store3 - FRAME_END - ret - -.Lenc_out_4: - state_store4 - FRAME_END - ret - -.Lenc_out_5: - state_store5 - FRAME_END - ret - -.Lenc_out_6: - state_store6 - FRAME_END - ret - -.Lenc_out_7: - state_store7 - FRAME_END - ret - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_enc) - -/* - * void crypto_aegis128l_aesni_enc_tail(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis128l_aesni_enc_tail) - FRAME_BEGIN - - state_load - - /* encrypt message: */ - call __load_partial - - movdqa MSG0, T0 - movdqa MSG1, T1 - crypt0 T0, T1 - - call __store_partial - - update0 - - state_store0 - - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_enc_tail) - -/* - * void crypto_aegis128l_aesni_dec(void *state, unsigned int length, - * const void *src, void *dst); - */ 
-ENTRY(crypto_aegis128l_aesni_dec) - FRAME_BEGIN - - cmp $0x20, LEN - jb .Ldec_out - - state_load - - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 8 -.Ldec_a_loop: - decrypt_block a 0 - decrypt_block a 1 - decrypt_block a 2 - decrypt_block a 3 - decrypt_block a 4 - decrypt_block a 5 - decrypt_block a 6 - decrypt_block a 7 - - add $0x100, SRC - add $0x100, DST - jmp .Ldec_a_loop - -.align 8 -.Ldec_u_loop: - decrypt_block u 0 - decrypt_block u 1 - decrypt_block u 2 - decrypt_block u 3 - decrypt_block u 4 - decrypt_block u 5 - decrypt_block u 6 - decrypt_block u 7 - - add $0x100, SRC - add $0x100, DST - jmp .Ldec_u_loop - -.Ldec_out_0: - state_store0 - FRAME_END - ret - -.Ldec_out_1: - state_store1 - FRAME_END - ret - -.Ldec_out_2: - state_store2 - FRAME_END - ret - -.Ldec_out_3: - state_store3 - FRAME_END - ret - -.Ldec_out_4: - state_store4 - FRAME_END - ret - -.Ldec_out_5: - state_store5 - FRAME_END - ret - -.Ldec_out_6: - state_store6 - FRAME_END - ret - -.Ldec_out_7: - state_store7 - FRAME_END - ret - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_dec) - -/* - * void crypto_aegis128l_aesni_dec_tail(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis128l_aesni_dec_tail) - FRAME_BEGIN - - state_load - - /* decrypt message: */ - call __load_partial - - crypt0 MSG0, MSG1 - - movdqa MSG0, T0 - movdqa MSG1, T1 - call __store_partial - - /* mask with byte count: */ - movq LEN, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa T0, T1 - movdqa .Laegis128l_counter0, T2 - movdqa .Laegis128l_counter1, T3 - pcmpgtb T2, T0 - pcmpgtb T3, T1 - pand T0, MSG0 - pand T1, MSG1 - - update0 - - state_store0 - - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_dec_tail) - -/* - * void crypto_aegis128l_aesni_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_aegis128l_aesni_final) - FRAME_BEGIN - - state_load - - /* prepare length block: */ - movq %rdx, MSG0 - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG0 - psllq $3, MSG0 /* multiply by 8 (to get bit count) */ - - pxor STATE2, MSG0 - movdqa MSG0, MSG1 - - /* update state: */ - update0 - update1 - update2 - update3 - update4 - update5 - update6 - - /* xor tag: */ - movdqu (%rsi), T0 - - pxor STATE1, T0 - pxor STATE2, T0 - pxor STATE3, T0 - pxor STATE4, T0 - pxor STATE5, T0 - pxor STATE6, T0 - pxor STATE7, T0 - - movdqu T0, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_aegis128l_aesni_final) diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c deleted file mode 100644 index 19eb28b316f0..000000000000 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ /dev/null @@ -1,293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The AEGIS-128L Authenticated-Encryption Algorithm - * Glue for AES-NI + SSE2 implementation - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
- */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <crypto/scatterwalk.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -#define AEGIS128L_BLOCK_ALIGN 16 -#define AEGIS128L_BLOCK_SIZE 32 -#define AEGIS128L_NONCE_SIZE 16 -#define AEGIS128L_STATE_BLOCKS 8 -#define AEGIS128L_KEY_SIZE 16 -#define AEGIS128L_MIN_AUTH_SIZE 8 -#define AEGIS128L_MAX_AUTH_SIZE 16 - -asmlinkage void crypto_aegis128l_aesni_init(void *state, void *key, void *iv); - -asmlinkage void crypto_aegis128l_aesni_ad( - void *state, unsigned int length, const void *data); - -asmlinkage void crypto_aegis128l_aesni_enc( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128l_aesni_dec( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128l_aesni_enc_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128l_aesni_dec_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis128l_aesni_final( - void *state, void *tag_xor, unsigned int cryptlen, - unsigned int assoclen); - -struct aegis_block { - u8 bytes[AEGIS128L_BLOCK_SIZE] __aligned(AEGIS128L_BLOCK_ALIGN); -}; - -struct aegis_state { - struct aegis_block blocks[AEGIS128L_STATE_BLOCKS]; -}; - -struct aegis_ctx { - struct aegis_block key; -}; - -struct aegis_crypt_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, unsigned int length, const void *src, - void *dst); - void (*crypt_tail)(void *state, unsigned int length, const void *src, - void *dst); -}; - -static void crypto_aegis128l_aesni_process_ad( - struct aegis_state *state, struct scatterlist *sg_src, - unsigned int assoclen) -{ - struct scatter_walk walk; - struct aegis_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= AEGIS128L_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = AEGIS128L_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - crypto_aegis128l_aesni_ad(state, - AEGIS128L_BLOCK_SIZE, - buf.bytes); - pos = 0; - left -= fill; - src += fill; - } - - crypto_aegis128l_aesni_ad(state, left, src); - - src += left & ~(AEGIS128L_BLOCK_SIZE - 1); - left &= AEGIS128L_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - pos += left; - assoclen -= size; - - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, AEGIS128L_BLOCK_SIZE - pos); - crypto_aegis128l_aesni_ad(state, AEGIS128L_BLOCK_SIZE, buf.bytes); - } -} - -static void crypto_aegis128l_aesni_process_crypt( - struct aegis_state *state, struct skcipher_walk *walk, - const struct aegis_crypt_ops *ops) -{ - while (walk->nbytes >= AEGIS128L_BLOCK_SIZE) { - ops->crypt_blocks(state, round_down(walk->nbytes, - AEGIS128L_BLOCK_SIZE), - walk->src.virt.addr, walk->dst.virt.addr); - skcipher_walk_done(walk, walk->nbytes % AEGIS128L_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, - walk->dst.virt.addr); - skcipher_walk_done(walk, 0); - } -} - -static struct aegis_ctx 
*crypto_aegis128l_aesni_ctx(struct crypto_aead *aead) -{ - u8 *ctx = crypto_aead_ctx(aead); - ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); - return (void *)ctx; -} - -static int crypto_aegis128l_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(aead); - - if (keylen != AEGIS128L_KEY_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - memcpy(ctx->key.bytes, key, AEGIS128L_KEY_SIZE); - - return 0; -} - -static int crypto_aegis128l_aesni_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - if (authsize > AEGIS128L_MAX_AUTH_SIZE) - return -EINVAL; - if (authsize < AEGIS128L_MIN_AUTH_SIZE) - return -EINVAL; - return 0; -} - -static void crypto_aegis128l_aesni_crypt(struct aead_request *req, - struct aegis_block *tag_xor, - unsigned int cryptlen, - const struct aegis_crypt_ops *ops) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm); - struct skcipher_walk walk; - struct aegis_state state; - - ops->skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv); - crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis128l_aesni_process_crypt(&state, &walk, ops); - crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -static int crypto_aegis128l_aesni_encrypt(struct aead_request *req) -{ - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = crypto_aegis128l_aesni_enc, - .crypt_tail = crypto_aegis128l_aesni_enc_tail, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} - -static int crypto_aegis128l_aesni_decrypt(struct aead_request *req) -{ - static const struct aegis_block zeros = {}; - - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = crypto_aegis128l_aesni_dec, - .crypt_tail = crypto_aegis128l_aesni_dec_tail, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_aegis128l_aesni_crypt(req, &tag, cryptlen, &OPS); - - return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? 
-EBADMSG : 0; -} - -static int crypto_aegis128l_aesni_init_tfm(struct crypto_aead *aead) -{ - return 0; -} - -static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) -{ -} - -static struct aead_alg crypto_aegis128l_aesni_alg = { - .setkey = crypto_aegis128l_aesni_setkey, - .setauthsize = crypto_aegis128l_aesni_setauthsize, - .encrypt = crypto_aegis128l_aesni_encrypt, - .decrypt = crypto_aegis128l_aesni_decrypt, - .init = crypto_aegis128l_aesni_init_tfm, - .exit = crypto_aegis128l_aesni_exit_tfm, - - .ivsize = AEGIS128L_NONCE_SIZE, - .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, - .chunksize = AEGIS128L_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - .cra_priority = 400, - - .cra_name = "__aegis128l", - .cra_driver_name = "__aegis128l-aesni", - - .cra_module = THIS_MODULE, - } -}; - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_aegis128l_aesni_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_AES) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_aegis128l_aesni_alg, 1, - &simd_alg); -} - -static void __exit crypto_aegis128l_aesni_module_exit(void) -{ - simd_unregister_aeads(&crypto_aegis128l_aesni_alg, 1, &simd_alg); -} - -module_init(crypto_aegis128l_aesni_module_init); -module_exit(crypto_aegis128l_aesni_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("AEGIS-128L AEAD algorithm -- AESNI+SSE2 implementation"); -MODULE_ALIAS_CRYPTO("aegis128l"); -MODULE_ALIAS_CRYPTO("aegis128l-aesni"); diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S deleted file mode 100644 index 37d9b13dfd85..000000000000 --- a/arch/x86/crypto/aegis256-aesni-asm.S +++ /dev/null @@ -1,700 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AES-NI + SSE2 implementation of AEGIS-128L - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
- */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define STATE0 %xmm0 -#define STATE1 %xmm1 -#define STATE2 %xmm2 -#define STATE3 %xmm3 -#define STATE4 %xmm4 -#define STATE5 %xmm5 -#define MSG %xmm6 -#define T0 %xmm7 -#define T1 %xmm8 -#define T2 %xmm9 -#define T3 %xmm10 - -#define STATEP %rdi -#define LEN %rsi -#define SRC %rdx -#define DST %rcx - -.section .rodata.cst16.aegis256_const, "aM", @progbits, 32 -.align 16 -.Laegis256_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Laegis256_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.aegis256_counter, "aM", @progbits, 16 -.align 16 -.Laegis256_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - -.text - -/* - * __load_partial: internal ABI - * input: - * LEN - bytes - * SRC - src - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG, MSG - - mov LEN, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov LEN, %r8 - and $0x1E, %r8 - add SRC, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov LEN, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov LEN, %r8 - and $0x1C, %r8 - add SRC, %r8 - shl $0x10, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov LEN, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov LEN, %r8 - and $0x18, %r8 - add SRC, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov LEN, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov LEN, %r8 - and $0x10, %r8 - add SRC, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * LEN - bytes - * DST - dst - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov LEN, %r8 - mov DST, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $0x10, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -.macro update - movdqa STATE5, T0 - aesenc STATE0, STATE5 - aesenc STATE1, STATE0 - aesenc STATE2, STATE1 - aesenc STATE3, STATE2 - aesenc STATE4, STATE3 - aesenc T0, STATE4 -.endm - -.macro update0 m - update - pxor \m, STATE5 -.endm - -.macro update1 m - update - pxor \m, STATE4 -.endm - -.macro update2 m - update - pxor \m, STATE3 -.endm - -.macro update3 m - update - pxor \m, STATE2 -.endm - -.macro update4 m - update - pxor \m, STATE1 -.endm - -.macro update5 m - update - pxor \m, STATE0 -.endm - -.macro state_load - movdqu 0x00(STATEP), STATE0 - movdqu 0x10(STATEP), STATE1 - movdqu 0x20(STATEP), STATE2 - movdqu 0x30(STATEP), STATE3 - movdqu 0x40(STATEP), STATE4 - movdqu 0x50(STATEP), STATE5 -.endm - -.macro state_store s0 s1 s2 s3 s4 s5 - movdqu \s5, 0x00(STATEP) - movdqu \s0, 0x10(STATEP) - movdqu \s1, 0x20(STATEP) - movdqu \s2, 0x30(STATEP) - movdqu \s3, 0x40(STATEP) - movdqu \s4, 0x50(STATEP) -.endm - -.macro state_store0 - state_store STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 -.endm - -.macro state_store1 - state_store 
STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 -.endm - -.macro state_store2 - state_store STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 -.endm - -.macro state_store3 - state_store STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 -.endm - -.macro state_store4 - state_store STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 -.endm - -.macro state_store5 - state_store STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 -.endm - -/* - * void crypto_aegis256_aesni_init(void *state, const void *key, const void *iv); - */ -ENTRY(crypto_aegis256_aesni_init) - FRAME_BEGIN - - /* load key: */ - movdqa 0x00(%rsi), MSG - movdqa 0x10(%rsi), T1 - movdqa MSG, STATE4 - movdqa T1, STATE5 - - /* load IV: */ - movdqu 0x00(%rdx), T2 - movdqu 0x10(%rdx), T3 - pxor MSG, T2 - pxor T1, T3 - movdqa T2, STATE0 - movdqa T3, STATE1 - - /* load the constants: */ - movdqa .Laegis256_const_0, STATE3 - movdqa .Laegis256_const_1, STATE2 - pxor STATE3, STATE4 - pxor STATE2, STATE5 - - /* update 10 times with IV and KEY: */ - update0 MSG - update1 T1 - update2 T2 - update3 T3 - update4 MSG - update5 T1 - update0 T2 - update1 T3 - update2 MSG - update3 T1 - update4 T2 - update5 T3 - update0 MSG - update1 T1 - update2 T2 - update3 T3 - - state_store3 - - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_init) - -.macro ad_block a i - movdq\a (\i * 0x10)(SRC), MSG - update\i MSG - sub $0x10, LEN - cmp $0x10, LEN - jl .Lad_out_\i -.endm - -/* - * void crypto_aegis256_aesni_ad(void *state, unsigned int length, - * const void *data); - */ -ENTRY(crypto_aegis256_aesni_ad) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Lad_out - - state_load - - mov SRC, %r8 - and $0xf, %r8 - jnz .Lad_u_loop - -.align 8 -.Lad_a_loop: - ad_block a 0 - ad_block a 1 - ad_block a 2 - ad_block a 3 - ad_block a 4 - ad_block a 5 - - add $0x60, SRC - jmp .Lad_a_loop - -.align 8 -.Lad_u_loop: - ad_block u 0 - ad_block u 1 - ad_block u 2 - ad_block u 3 - ad_block u 4 - ad_block u 5 - - add $0x60, SRC - jmp .Lad_u_loop - -.Lad_out_0: - state_store0 - FRAME_END - ret - -.Lad_out_1: - state_store1 - FRAME_END - ret - -.Lad_out_2: - state_store2 - FRAME_END - ret - -.Lad_out_3: - state_store3 - FRAME_END - ret - -.Lad_out_4: - state_store4 - FRAME_END - ret - -.Lad_out_5: - state_store5 - FRAME_END - ret - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_ad) - -.macro crypt m s0 s1 s2 s3 s4 s5 - pxor \s1, \m - pxor \s4, \m - pxor \s5, \m - movdqa \s2, T3 - pand \s3, T3 - pxor T3, \m -.endm - -.macro crypt0 m - crypt \m STATE0 STATE1 STATE2 STATE3 STATE4 STATE5 -.endm - -.macro crypt1 m - crypt \m STATE5 STATE0 STATE1 STATE2 STATE3 STATE4 -.endm - -.macro crypt2 m - crypt \m STATE4 STATE5 STATE0 STATE1 STATE2 STATE3 -.endm - -.macro crypt3 m - crypt \m STATE3 STATE4 STATE5 STATE0 STATE1 STATE2 -.endm - -.macro crypt4 m - crypt \m STATE2 STATE3 STATE4 STATE5 STATE0 STATE1 -.endm - -.macro crypt5 m - crypt \m STATE1 STATE2 STATE3 STATE4 STATE5 STATE0 -.endm - -.macro encrypt_block a i - movdq\a (\i * 0x10)(SRC), MSG - movdqa MSG, T0 - crypt\i T0 - movdq\a T0, (\i * 0x10)(DST) - - update\i MSG - - sub $0x10, LEN - cmp $0x10, LEN - jl .Lenc_out_\i -.endm - -.macro decrypt_block a i - movdq\a (\i * 0x10)(SRC), MSG - crypt\i MSG - movdq\a MSG, (\i * 0x10)(DST) - - update\i MSG - - sub $0x10, LEN - cmp $0x10, LEN - jl .Ldec_out_\i -.endm - -/* - * void crypto_aegis256_aesni_enc(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis256_aesni_enc) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Lenc_out - - state_load - - mov SRC, %r8 - or DST, %r8 - and $0xf, %r8 - jnz 
.Lenc_u_loop - -.align 8 -.Lenc_a_loop: - encrypt_block a 0 - encrypt_block a 1 - encrypt_block a 2 - encrypt_block a 3 - encrypt_block a 4 - encrypt_block a 5 - - add $0x60, SRC - add $0x60, DST - jmp .Lenc_a_loop - -.align 8 -.Lenc_u_loop: - encrypt_block u 0 - encrypt_block u 1 - encrypt_block u 2 - encrypt_block u 3 - encrypt_block u 4 - encrypt_block u 5 - - add $0x60, SRC - add $0x60, DST - jmp .Lenc_u_loop - -.Lenc_out_0: - state_store0 - FRAME_END - ret - -.Lenc_out_1: - state_store1 - FRAME_END - ret - -.Lenc_out_2: - state_store2 - FRAME_END - ret - -.Lenc_out_3: - state_store3 - FRAME_END - ret - -.Lenc_out_4: - state_store4 - FRAME_END - ret - -.Lenc_out_5: - state_store5 - FRAME_END - ret - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_enc) - -/* - * void crypto_aegis256_aesni_enc_tail(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis256_aesni_enc_tail) - FRAME_BEGIN - - state_load - - /* encrypt message: */ - call __load_partial - - movdqa MSG, T0 - crypt0 T0 - - call __store_partial - - update0 MSG - - state_store0 - - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_enc_tail) - -/* - * void crypto_aegis256_aesni_dec(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis256_aesni_dec) - FRAME_BEGIN - - cmp $0x10, LEN - jb .Ldec_out - - state_load - - mov SRC, %r8 - or DST, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 8 -.Ldec_a_loop: - decrypt_block a 0 - decrypt_block a 1 - decrypt_block a 2 - decrypt_block a 3 - decrypt_block a 4 - decrypt_block a 5 - - add $0x60, SRC - add $0x60, DST - jmp .Ldec_a_loop - -.align 8 -.Ldec_u_loop: - decrypt_block u 0 - decrypt_block u 1 - decrypt_block u 2 - decrypt_block u 3 - decrypt_block u 4 - decrypt_block u 5 - - add $0x60, SRC - add $0x60, DST - jmp .Ldec_u_loop - -.Ldec_out_0: - state_store0 - FRAME_END - ret - -.Ldec_out_1: - state_store1 - FRAME_END - ret - -.Ldec_out_2: - state_store2 - FRAME_END - ret - -.Ldec_out_3: - state_store3 - FRAME_END - ret - -.Ldec_out_4: - state_store4 - FRAME_END - ret - -.Ldec_out_5: - state_store5 - FRAME_END - ret - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_dec) - -/* - * void crypto_aegis256_aesni_dec_tail(void *state, unsigned int length, - * const void *src, void *dst); - */ -ENTRY(crypto_aegis256_aesni_dec_tail) - FRAME_BEGIN - - state_load - - /* decrypt message: */ - call __load_partial - - crypt0 MSG - - movdqa MSG, T0 - call __store_partial - - /* mask with byte count: */ - movq LEN, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Laegis256_counter, T1 - pcmpgtb T1, T0 - pand T0, MSG - - update0 MSG - - state_store0 - - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_dec_tail) - -/* - * void crypto_aegis256_aesni_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_aegis256_aesni_final) - FRAME_BEGIN - - state_load - - /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG - psllq $3, MSG /* multiply by 8 (to get bit count) */ - - pxor STATE3, MSG - - /* update state: */ - update0 MSG - update1 MSG - update2 MSG - update3 MSG - update4 MSG - update5 MSG - update0 MSG - - /* xor tag: */ - movdqu (%rsi), MSG - - pxor STATE0, MSG - pxor STATE1, MSG - pxor STATE2, MSG - pxor STATE3, MSG - pxor STATE4, MSG - pxor STATE5, MSG - - movdqu MSG, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_aegis256_aesni_final) diff --git a/arch/x86/crypto/aegis256-aesni-glue.c 
b/arch/x86/crypto/aegis256-aesni-glue.c deleted file mode 100644 index f84da27171d3..000000000000 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ /dev/null @@ -1,293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The AEGIS-256 Authenticated-Encryption Algorithm - * Glue for AES-NI + SSE2 implementation - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <crypto/scatterwalk.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -#define AEGIS256_BLOCK_ALIGN 16 -#define AEGIS256_BLOCK_SIZE 16 -#define AEGIS256_NONCE_SIZE 32 -#define AEGIS256_STATE_BLOCKS 6 -#define AEGIS256_KEY_SIZE 32 -#define AEGIS256_MIN_AUTH_SIZE 8 -#define AEGIS256_MAX_AUTH_SIZE 16 - -asmlinkage void crypto_aegis256_aesni_init(void *state, void *key, void *iv); - -asmlinkage void crypto_aegis256_aesni_ad( - void *state, unsigned int length, const void *data); - -asmlinkage void crypto_aegis256_aesni_enc( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis256_aesni_dec( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis256_aesni_enc_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis256_aesni_dec_tail( - void *state, unsigned int length, const void *src, void *dst); - -asmlinkage void crypto_aegis256_aesni_final( - void *state, void *tag_xor, unsigned int cryptlen, - unsigned int assoclen); - -struct aegis_block { - u8 bytes[AEGIS256_BLOCK_SIZE] __aligned(AEGIS256_BLOCK_ALIGN); -}; - -struct aegis_state { - struct aegis_block blocks[AEGIS256_STATE_BLOCKS]; -}; - -struct aegis_ctx { - struct aegis_block key[AEGIS256_KEY_SIZE / AEGIS256_BLOCK_SIZE]; -}; - -struct aegis_crypt_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, unsigned int length, const void *src, - void *dst); - void (*crypt_tail)(void *state, unsigned int length, const void *src, - void *dst); -}; - -static void crypto_aegis256_aesni_process_ad( - struct aegis_state *state, struct scatterlist *sg_src, - unsigned int assoclen) -{ - struct scatter_walk walk; - struct aegis_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= AEGIS256_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = AEGIS256_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - crypto_aegis256_aesni_ad(state, - AEGIS256_BLOCK_SIZE, - buf.bytes); - pos = 0; - left -= fill; - src += fill; - } - - crypto_aegis256_aesni_ad(state, left, src); - - src += left & ~(AEGIS256_BLOCK_SIZE - 1); - left &= AEGIS256_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - pos += left; - assoclen -= size; - - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, AEGIS256_BLOCK_SIZE - pos); - crypto_aegis256_aesni_ad(state, AEGIS256_BLOCK_SIZE, buf.bytes); - } -} - -static void crypto_aegis256_aesni_process_crypt( - struct aegis_state *state, struct skcipher_walk *walk, - const struct 
aegis_crypt_ops *ops) -{ - while (walk->nbytes >= AEGIS256_BLOCK_SIZE) { - ops->crypt_blocks(state, - round_down(walk->nbytes, AEGIS256_BLOCK_SIZE), - walk->src.virt.addr, walk->dst.virt.addr); - skcipher_walk_done(walk, walk->nbytes % AEGIS256_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, - walk->dst.virt.addr); - skcipher_walk_done(walk, 0); - } -} - -static struct aegis_ctx *crypto_aegis256_aesni_ctx(struct crypto_aead *aead) -{ - u8 *ctx = crypto_aead_ctx(aead); - ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); - return (void *)ctx; -} - -static int crypto_aegis256_aesni_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(aead); - - if (keylen != AEGIS256_KEY_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - memcpy(ctx->key, key, AEGIS256_KEY_SIZE); - - return 0; -} - -static int crypto_aegis256_aesni_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - if (authsize > AEGIS256_MAX_AUTH_SIZE) - return -EINVAL; - if (authsize < AEGIS256_MIN_AUTH_SIZE) - return -EINVAL; - return 0; -} - -static void crypto_aegis256_aesni_crypt(struct aead_request *req, - struct aegis_block *tag_xor, - unsigned int cryptlen, - const struct aegis_crypt_ops *ops) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm); - struct skcipher_walk walk; - struct aegis_state state; - - ops->skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - crypto_aegis256_aesni_init(&state, ctx->key, req->iv); - crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen); - crypto_aegis256_aesni_process_crypt(&state, &walk, ops); - crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -static int crypto_aegis256_aesni_encrypt(struct aead_request *req) -{ - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = crypto_aegis256_aesni_enc, - .crypt_tail = crypto_aegis256_aesni_enc_tail, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} - -static int crypto_aegis256_aesni_decrypt(struct aead_request *req) -{ - static const struct aegis_block zeros = {}; - - static const struct aegis_crypt_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = crypto_aegis256_aesni_dec, - .crypt_tail = crypto_aegis256_aesni_dec_tail, - }; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aegis_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_aegis256_aesni_crypt(req, &tag, cryptlen, &OPS); - - return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? 
-EBADMSG : 0; -} - -static int crypto_aegis256_aesni_init_tfm(struct crypto_aead *aead) -{ - return 0; -} - -static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead) -{ -} - -static struct aead_alg crypto_aegis256_aesni_alg = { - .setkey = crypto_aegis256_aesni_setkey, - .setauthsize = crypto_aegis256_aesni_setauthsize, - .encrypt = crypto_aegis256_aesni_encrypt, - .decrypt = crypto_aegis256_aesni_decrypt, - .init = crypto_aegis256_aesni_init_tfm, - .exit = crypto_aegis256_aesni_exit_tfm, - - .ivsize = AEGIS256_NONCE_SIZE, - .maxauthsize = AEGIS256_MAX_AUTH_SIZE, - .chunksize = AEGIS256_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - .cra_priority = 400, - - .cra_name = "__aegis256", - .cra_driver_name = "__aegis256-aesni", - - .cra_module = THIS_MODULE, - } -}; - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_aegis256_aesni_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !boot_cpu_has(X86_FEATURE_AES) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_aegis256_aesni_alg, 1, - &simd_alg); -} - -static void __exit crypto_aegis256_aesni_module_exit(void) -{ - simd_unregister_aeads(&crypto_aegis256_aesni_alg, 1, &simd_alg); -} - -module_init(crypto_aegis256_aesni_module_init); -module_exit(crypto_aegis256_aesni_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("AEGIS-256 AEAD algorithm -- AESNI+SSE2 implementation"); -MODULE_ALIAS_CRYPTO("aegis256"); -MODULE_ALIAS_CRYPTO("aegis256-aesni"); diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S deleted file mode 100644 index 2849dbc59e11..000000000000 --- a/arch/x86/crypto/aes-i586-asm_32.S +++ /dev/null @@ -1,362 +0,0 @@ -// ------------------------------------------------------------------------- -// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK. -// All rights reserved. -// -// LICENSE TERMS -// -// The free distribution and use of this software in both source and binary -// form is allowed (with or without changes) provided that: -// -// 1. distributions of this source code include the above copyright -// notice, this list of conditions and the following disclaimer// -// -// 2. distributions in binary form include the above copyright -// notice, this list of conditions and the following disclaimer -// in the documentation and/or other associated materials// -// -// 3. the copyright holder's name is not used to endorse products -// built using this software without specific written permission. -// -// -// ALTERNATIVELY, provided that this notice is retained in full, this product -// may be distributed under the terms of the GNU General Public License (GPL), -// in which case the provisions of the GPL apply INSTEAD OF those given above. -// -// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org> -// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> - -// DISCLAIMER -// -// This software is provided 'as is' with no explicit or implied warranties -// in respect of its properties including, but not limited to, correctness -// and fitness for purpose. 
-// ------------------------------------------------------------------------- -// Issue Date: 29/07/2002 - -.file "aes-i586-asm.S" -.text - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> - -#define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) - -/* offsets to parameters with one register pushed onto stack */ -#define ctx 8 -#define out_blk 12 -#define in_blk 16 - -/* offsets in crypto_aes_ctx structure */ -#define klen (480) -#define ekey (0) -#define dkey (240) - -// register mapping for encrypt and decrypt subroutines - -#define r0 eax -#define r1 ebx -#define r2 ecx -#define r3 edx -#define r4 esi -#define r5 edi - -#define eaxl al -#define eaxh ah -#define ebxl bl -#define ebxh bh -#define ecxl cl -#define ecxh ch -#define edxl dl -#define edxh dh - -#define _h(reg) reg##h -#define h(reg) _h(reg) - -#define _l(reg) reg##l -#define l(reg) _l(reg) - -// This macro takes a 32-bit word representing a column and uses -// each of its four bytes to index into four tables of 256 32-bit -// words to obtain values that are then xored into the appropriate -// output registers r0, r1, r4 or r5. - -// Parameters: -// table table base address -// %1 out_state[0] -// %2 out_state[1] -// %3 out_state[2] -// %4 out_state[3] -// idx input register for the round (destroyed) -// tmp scratch register for the round -// sched key schedule - -#define do_col(table, a1,a2,a3,a4, idx, tmp) \ - movzx %l(idx),%tmp; \ - xor table(,%tmp,4),%a1; \ - movzx %h(idx),%tmp; \ - shr $16,%idx; \ - xor table+tlen(,%tmp,4),%a2; \ - movzx %l(idx),%tmp; \ - movzx %h(idx),%idx; \ - xor table+2*tlen(,%tmp,4),%a3; \ - xor table+3*tlen(,%idx,4),%a4; - -// initialise output registers from the key schedule -// NB1: original value of a3 is in idx on exit -// NB2: original values of a1,a2,a4 aren't used -#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \ - mov 0 sched,%a1; \ - movzx %l(idx),%tmp; \ - mov 12 sched,%a2; \ - xor table(,%tmp,4),%a1; \ - mov 4 sched,%a4; \ - movzx %h(idx),%tmp; \ - shr $16,%idx; \ - xor table+tlen(,%tmp,4),%a2; \ - movzx %l(idx),%tmp; \ - movzx %h(idx),%idx; \ - xor table+3*tlen(,%idx,4),%a4; \ - mov %a3,%idx; \ - mov 8 sched,%a3; \ - xor table+2*tlen(,%tmp,4),%a3; - -// initialise output registers from the key schedule -// NB1: original value of a3 is in idx on exit -// NB2: original values of a1,a2,a4 aren't used -#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \ - mov 0 sched,%a1; \ - movzx %l(idx),%tmp; \ - mov 4 sched,%a2; \ - xor table(,%tmp,4),%a1; \ - mov 12 sched,%a4; \ - movzx %h(idx),%tmp; \ - shr $16,%idx; \ - xor table+tlen(,%tmp,4),%a2; \ - movzx %l(idx),%tmp; \ - movzx %h(idx),%idx; \ - xor table+3*tlen(,%idx,4),%a4; \ - mov %a3,%idx; \ - mov 8 sched,%a3; \ - xor table+2*tlen(,%tmp,4),%a3; - - -// original Gladman had conditional saves to MMX regs. -#define save(a1, a2) \ - mov %a2,4*a1(%esp) - -#define restore(a1, a2) \ - mov 4*a2(%esp),%a1 - -// These macros perform a forward encryption cycle. They are entered with -// the first previous round column values in r0,r1,r4,r5 and -// exit with the final values in the same registers, using stack -// for temporary storage. 
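
For reference, each do_col step described above is the classic 32-bit table-lookup formulation of an AES round: the four bytes of one input column each select an entry from one of four 256-entry tables (laid out back to back, tlen bytes apart), and each result is XORed into a different output column. A C sketch of the same computation — do_col_sketch and ft are illustrative names for the table layout the macros assume, not the kernel's code:

    #include <stdint.h>

    /*
     * One do_col step: 'idx' is the input column, out[0..3] are the four
     * output columns being accumulated (a1..a4 in the macro).  Each table
     * holds 256 32-bit words, i.e. tlen == 1024 bytes.
     */
    static void do_col_sketch(const uint32_t ft[4][256], uint32_t idx,
    			      uint32_t out[4])
    {
    	out[0] ^= ft[0][(idx >>  0) & 0xff];	/* xor table(,%tmp,4)        */
    	out[1] ^= ft[1][(idx >>  8) & 0xff];	/* xor table+tlen(,%tmp,4)   */
    	out[2] ^= ft[2][(idx >> 16) & 0xff];	/* xor table+2*tlen(,%tmp,4) */
    	out[3] ^= ft[3][(idx >> 24) & 0xff];	/* xor table+3*tlen(,%idx,4) */
    }

A full round applies this to all four input columns, rotating which registers receive the results; the fwd_rnd1/fwd_rnd2 macros below spell out that rotation, with do_fcol additionally loading the round key words into the output registers first.
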
- -// round column values -// on entry: r0,r1,r4,r5 -// on exit: r2,r1,r4,r5 -#define fwd_rnd1(arg, table) \ - save (0,r1); \ - save (1,r5); \ - \ - /* compute new column values */ \ - do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \ - do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \ - restore(r0,0); \ - do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \ - restore(r0,1); \ - do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */ - -// round column values -// on entry: r2,r1,r4,r5 -// on exit: r0,r1,r4,r5 -#define fwd_rnd2(arg, table) \ - save (0,r1); \ - save (1,r5); \ - \ - /* compute new column values */ \ - do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \ - do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \ - restore(r2,0); \ - do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \ - restore(r2,1); \ - do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */ - -// These macros performs an inverse encryption cycle. They are entered with -// the first previous round column values in r0,r1,r4,r5 and -// exit with the final values in the same registers, using stack -// for temporary storage - -// round column values -// on entry: r0,r1,r4,r5 -// on exit: r2,r1,r4,r5 -#define inv_rnd1(arg, table) \ - save (0,r1); \ - save (1,r5); \ - \ - /* compute new column values */ \ - do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \ - do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \ - restore(r0,0); \ - do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \ - restore(r0,1); \ - do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */ - -// round column values -// on entry: r2,r1,r4,r5 -// on exit: r0,r1,r4,r5 -#define inv_rnd2(arg, table) \ - save (0,r1); \ - save (1,r5); \ - \ - /* compute new column values */ \ - do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \ - do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \ - restore(r2,0); \ - do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \ - restore(r2,1); \ - do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */ - -// AES (Rijndael) Encryption Subroutine -/* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ - -.extern crypto_ft_tab -.extern crypto_fl_tab - -ENTRY(aes_enc_blk) - push %ebp - mov ctx(%esp),%ebp - -// CAUTION: the order and the values used in these assigns -// rely on the register mappings - -1: push %ebx - mov in_blk+4(%esp),%r2 - push %esi - mov klen(%ebp),%r3 // key size - push %edi -#if ekey != 0 - lea ekey(%ebp),%ebp // key pointer -#endif - -// input four columns and xor in first round key - - mov (%r2),%r0 - mov 4(%r2),%r1 - mov 8(%r2),%r4 - mov 12(%r2),%r5 - xor (%ebp),%r0 - xor 4(%ebp),%r1 - xor 8(%ebp),%r4 - xor 12(%ebp),%r5 - - sub $8,%esp // space for register saves on stack - add $16,%ebp // increment to next round key - cmp $24,%r3 - jb 4f // 10 rounds for 128-bit key - lea 32(%ebp),%ebp - je 3f // 12 rounds for 192-bit key - lea 32(%ebp),%ebp - -2: fwd_rnd1( -64(%ebp), crypto_ft_tab) // 14 rounds for 256-bit key - fwd_rnd2( -48(%ebp), crypto_ft_tab) -3: fwd_rnd1( -32(%ebp), crypto_ft_tab) // 12 rounds for 192-bit key - fwd_rnd2( -16(%ebp), crypto_ft_tab) -4: fwd_rnd1( (%ebp), crypto_ft_tab) // 10 rounds for 128-bit key - fwd_rnd2( +16(%ebp), crypto_ft_tab) - fwd_rnd1( +32(%ebp), crypto_ft_tab) - fwd_rnd2( +48(%ebp), crypto_ft_tab) - fwd_rnd1( +64(%ebp), crypto_ft_tab) - fwd_rnd2( +80(%ebp), crypto_ft_tab) - fwd_rnd1( +96(%ebp), crypto_ft_tab) - fwd_rnd2(+112(%ebp), crypto_ft_tab) - fwd_rnd1(+128(%ebp), crypto_ft_tab) - fwd_rnd2(+144(%ebp), crypto_fl_tab) // last round uses a different table - -// move 
final values to the output array. CAUTION: the -// order of these assigns rely on the register mappings - - add $8,%esp - mov out_blk+12(%esp),%ebp - mov %r5,12(%ebp) - pop %edi - mov %r4,8(%ebp) - pop %esi - mov %r1,4(%ebp) - pop %ebx - mov %r0,(%ebp) - pop %ebp - ret -ENDPROC(aes_enc_blk) - -// AES (Rijndael) Decryption Subroutine -/* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ - -.extern crypto_it_tab -.extern crypto_il_tab - -ENTRY(aes_dec_blk) - push %ebp - mov ctx(%esp),%ebp - -// CAUTION: the order and the values used in these assigns -// rely on the register mappings - -1: push %ebx - mov in_blk+4(%esp),%r2 - push %esi - mov klen(%ebp),%r3 // key size - push %edi -#if dkey != 0 - lea dkey(%ebp),%ebp // key pointer -#endif - -// input four columns and xor in first round key - - mov (%r2),%r0 - mov 4(%r2),%r1 - mov 8(%r2),%r4 - mov 12(%r2),%r5 - xor (%ebp),%r0 - xor 4(%ebp),%r1 - xor 8(%ebp),%r4 - xor 12(%ebp),%r5 - - sub $8,%esp // space for register saves on stack - add $16,%ebp // increment to next round key - cmp $24,%r3 - jb 4f // 10 rounds for 128-bit key - lea 32(%ebp),%ebp - je 3f // 12 rounds for 192-bit key - lea 32(%ebp),%ebp - -2: inv_rnd1( -64(%ebp), crypto_it_tab) // 14 rounds for 256-bit key - inv_rnd2( -48(%ebp), crypto_it_tab) -3: inv_rnd1( -32(%ebp), crypto_it_tab) // 12 rounds for 192-bit key - inv_rnd2( -16(%ebp), crypto_it_tab) -4: inv_rnd1( (%ebp), crypto_it_tab) // 10 rounds for 128-bit key - inv_rnd2( +16(%ebp), crypto_it_tab) - inv_rnd1( +32(%ebp), crypto_it_tab) - inv_rnd2( +48(%ebp), crypto_it_tab) - inv_rnd1( +64(%ebp), crypto_it_tab) - inv_rnd2( +80(%ebp), crypto_it_tab) - inv_rnd1( +96(%ebp), crypto_it_tab) - inv_rnd2(+112(%ebp), crypto_it_tab) - inv_rnd1(+128(%ebp), crypto_it_tab) - inv_rnd2(+144(%ebp), crypto_il_tab) // last round uses a different table - -// move final values to the output array. CAUTION: the -// order of these assigns rely on the register mappings - - add $8,%esp - mov out_blk+12(%esp),%ebp - mov %r5,12(%ebp) - pop %edi - mov %r4,8(%ebp) - pop %esi - mov %r1,4(%ebp) - pop %ebx - mov %r0,(%ebp) - pop %ebp - ret -ENDPROC(aes_dec_blk) diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S deleted file mode 100644 index 8739cf7795de..000000000000 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ /dev/null @@ -1,185 +0,0 @@ -/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64 - * - * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de> - * - * License: - * This code can be distributed under the terms of the GNU General Public - * License (GPL) Version 2 provided that the above header down to and - * including this sentence is retained in full. 
- */ - -.extern crypto_ft_tab -.extern crypto_it_tab -.extern crypto_fl_tab -.extern crypto_il_tab - -.text - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> - -#define R1 %rax -#define R1E %eax -#define R1X %ax -#define R1H %ah -#define R1L %al -#define R2 %rbx -#define R2E %ebx -#define R2X %bx -#define R2H %bh -#define R2L %bl -#define R3 %rcx -#define R3E %ecx -#define R3X %cx -#define R3H %ch -#define R3L %cl -#define R4 %rdx -#define R4E %edx -#define R4X %dx -#define R4H %dh -#define R4L %dl -#define R5 %rsi -#define R5E %esi -#define R6 %rdi -#define R6E %edi -#define R7 %r9 /* don't use %rbp; it breaks stack traces */ -#define R7E %r9d -#define R8 %r8 -#define R10 %r10 -#define R11 %r11 - -#define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ - ENTRY(FUNC); \ - movq r1,r2; \ - leaq KEY+48(r8),r9; \ - movq r10,r11; \ - movl (r7),r5 ## E; \ - movl 4(r7),r1 ## E; \ - movl 8(r7),r6 ## E; \ - movl 12(r7),r7 ## E; \ - movl 480(r8),r10 ## E; \ - xorl -48(r9),r5 ## E; \ - xorl -44(r9),r1 ## E; \ - xorl -40(r9),r6 ## E; \ - xorl -36(r9),r7 ## E; \ - cmpl $24,r10 ## E; \ - jb B128; \ - leaq 32(r9),r9; \ - je B192; \ - leaq 32(r9),r9; - -#define epilogue(FUNC,r1,r2,r5,r6,r7,r8,r9) \ - movq r1,r2; \ - movl r5 ## E,(r9); \ - movl r6 ## E,4(r9); \ - movl r7 ## E,8(r9); \ - movl r8 ## E,12(r9); \ - ret; \ - ENDPROC(FUNC); - -#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ - movzbl r2 ## H,r5 ## E; \ - movzbl r2 ## L,r6 ## E; \ - movl TAB+1024(,r5,4),r5 ## E;\ - movw r4 ## X,r2 ## X; \ - movl TAB(,r6,4),r6 ## E; \ - roll $16,r2 ## E; \ - shrl $16,r4 ## E; \ - movzbl r4 ## L,r7 ## E; \ - movzbl r4 ## H,r4 ## E; \ - xorl OFFSET(r8),ra ## E; \ - xorl OFFSET+4(r8),rb ## E; \ - xorl TAB+3072(,r4,4),r5 ## E;\ - xorl TAB+2048(,r7,4),r6 ## E;\ - movzbl r1 ## L,r7 ## E; \ - movzbl r1 ## H,r4 ## E; \ - movl TAB+1024(,r4,4),r4 ## E;\ - movw r3 ## X,r1 ## X; \ - roll $16,r1 ## E; \ - shrl $16,r3 ## E; \ - xorl TAB(,r7,4),r5 ## E; \ - movzbl r3 ## L,r7 ## E; \ - movzbl r3 ## H,r3 ## E; \ - xorl TAB+3072(,r3,4),r4 ## E;\ - xorl TAB+2048(,r7,4),r5 ## E;\ - movzbl r1 ## L,r7 ## E; \ - movzbl r1 ## H,r3 ## E; \ - shrl $16,r1 ## E; \ - xorl TAB+3072(,r3,4),r6 ## E;\ - movl TAB+2048(,r7,4),r3 ## E;\ - movzbl r1 ## L,r7 ## E; \ - movzbl r1 ## H,r1 ## E; \ - xorl TAB+1024(,r1,4),r6 ## E;\ - xorl TAB(,r7,4),r3 ## E; \ - movzbl r2 ## H,r1 ## E; \ - movzbl r2 ## L,r7 ## E; \ - shrl $16,r2 ## E; \ - xorl TAB+3072(,r1,4),r3 ## E;\ - xorl TAB+2048(,r7,4),r4 ## E;\ - movzbl r2 ## H,r1 ## E; \ - movzbl r2 ## L,r2 ## E; \ - xorl OFFSET+8(r8),rc ## E; \ - xorl OFFSET+12(r8),rd ## E; \ - xorl TAB+1024(,r1,4),r3 ## E;\ - xorl TAB(,r2,4),r4 ## E; - -#define move_regs(r1,r2,r3,r4) \ - movl r3 ## E,r1 ## E; \ - movl r4 ## E,r2 ## E; - -#define entry(FUNC,KEY,B128,B192) \ - prologue(FUNC,KEY,B128,B192,R2,R8,R1,R3,R4,R6,R10,R5,R11) - -#define return(FUNC) epilogue(FUNC,R8,R2,R5,R6,R3,R4,R11) - -#define encrypt_round(TAB,OFFSET) \ - round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ - move_regs(R1,R2,R5,R6) - -#define encrypt_final(TAB,OFFSET) \ - round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) - -#define decrypt_round(TAB,OFFSET) \ - round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \ - move_regs(R1,R2,R5,R6) - -#define decrypt_final(TAB,OFFSET) \ - round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) - -/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ - - entry(aes_enc_blk,0,.Le128,.Le192) - encrypt_round(crypto_ft_tab,-96) - 
encrypt_round(crypto_ft_tab,-80) -.Le192: encrypt_round(crypto_ft_tab,-64) - encrypt_round(crypto_ft_tab,-48) -.Le128: encrypt_round(crypto_ft_tab,-32) - encrypt_round(crypto_ft_tab,-16) - encrypt_round(crypto_ft_tab, 0) - encrypt_round(crypto_ft_tab, 16) - encrypt_round(crypto_ft_tab, 32) - encrypt_round(crypto_ft_tab, 48) - encrypt_round(crypto_ft_tab, 64) - encrypt_round(crypto_ft_tab, 80) - encrypt_round(crypto_ft_tab, 96) - encrypt_final(crypto_fl_tab,112) - return(aes_enc_blk) - -/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ - - entry(aes_dec_blk,240,.Ld128,.Ld192) - decrypt_round(crypto_it_tab,-96) - decrypt_round(crypto_it_tab,-80) -.Ld192: decrypt_round(crypto_it_tab,-64) - decrypt_round(crypto_it_tab,-48) -.Ld128: decrypt_round(crypto_it_tab,-32) - decrypt_round(crypto_it_tab,-16) - decrypt_round(crypto_it_tab, 0) - decrypt_round(crypto_it_tab, 16) - decrypt_round(crypto_it_tab, 32) - decrypt_round(crypto_it_tab, 48) - decrypt_round(crypto_it_tab, 64) - decrypt_round(crypto_it_tab, 80) - decrypt_round(crypto_it_tab, 96) - decrypt_final(crypto_il_tab,112) - return(aes_dec_blk) diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 9e9d819e8bc3..7b7dc05fa1a4 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -1,71 +1 @@ // SPDX-License-Identifier: GPL-2.0-only -/* - * Glue Code for the asm optimized version of the AES Cipher Algorithm - * - */ - -#include <linux/module.h> -#include <crypto/aes.h> -#include <asm/crypto/aes.h> - -asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); -asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); - -void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) -{ - aes_enc_blk(ctx, dst, src); -} -EXPORT_SYMBOL_GPL(crypto_aes_encrypt_x86); - -void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) -{ - aes_dec_blk(ctx, dst, src); -} -EXPORT_SYMBOL_GPL(crypto_aes_decrypt_x86); - -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_enc_blk(crypto_tfm_ctx(tfm), dst, src); -} - -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - aes_dec_blk(crypto_tfm_ctx(tfm), dst, src); -} - -static struct crypto_alg aes_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-asm", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx), - .cra_module = THIS_MODULE, - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = crypto_aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - -static int __init aes_init(void) -{ - return crypto_register_alg(&aes_alg); -} - -static void __exit aes_fini(void) -{ - crypto_unregister_alg(&aes_alg); -} - -module_init(aes_init); -module_exit(aes_fini); - -MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("aes"); -MODULE_ALIAS_CRYPTO("aes-asm"); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 73c0ccb009a0..3e707e81afdb 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -26,7 +26,6 @@ #include <crypto/gcm.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> -#include <asm/crypto/aes.h> #include <asm/simd.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> @@ -329,7 
+328,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, } if (!crypto_simd_usable()) - err = crypto_aes_expand_key(ctx, in_key, key_len); + err = aes_expandkey(ctx, in_key, key_len); else { kernel_fpu_begin(); err = aesni_set_key(ctx, in_key, key_len); @@ -345,26 +344,26 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len); } -static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!crypto_simd_usable()) - crypto_aes_encrypt_x86(ctx, dst, src); - else { + if (!crypto_simd_usable()) { + aes_encrypt(ctx, dst, src); + } else { kernel_fpu_begin(); aesni_enc(ctx, dst, src); kernel_fpu_end(); } } -static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!crypto_simd_usable()) - crypto_aes_decrypt_x86(ctx, dst, src); - else { + if (!crypto_simd_usable()) { + aes_decrypt(ctx, dst, src); + } else { kernel_fpu_begin(); aesni_dec(ctx, dst, src); kernel_fpu_end(); @@ -610,7 +609,8 @@ static int xts_encrypt(struct skcipher_request *req) return glue_xts_req_128bit(&aesni_enc_xts, req, XTS_TWEAK_CAST(aesni_xts_tweak), aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + aes_ctx(ctx->raw_crypt_ctx), + false); } static int xts_decrypt(struct skcipher_request *req) @@ -621,32 +621,28 @@ static int xts_decrypt(struct skcipher_request *req) return glue_xts_req_128bit(&aesni_dec_xts, req, XTS_TWEAK_CAST(aesni_xts_tweak), aes_ctx(ctx->raw_tweak_ctx), - aes_ctx(ctx->raw_crypt_ctx)); + aes_ctx(ctx->raw_crypt_ctx), + true); } static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { - struct crypto_cipher *tfm; + struct crypto_aes_ctx ctx; int ret; - tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - ret = crypto_cipher_setkey(tfm, key, key_len); + ret = aes_expandkey(&ctx, key, key_len); if (ret) - goto out_free_cipher; + return ret; /* Clear the data in the hash sub key container to zero.*/ /* We want to cipher all zeros to create the hash sub key. 
*/ memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); - crypto_cipher_encrypt_one(tfm, hash_subkey, hash_subkey); + aes_encrypt(&ctx, hash_subkey, hash_subkey); -out_free_cipher: - crypto_free_cipher(tfm); - return ret; + memzero_explicit(&ctx, sizeof(ctx)); + return 0; } static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, @@ -919,8 +915,8 @@ static struct crypto_alg aesni_cipher_alg = { .cia_min_keysize = AES_MIN_KEY_SIZE, .cia_max_keysize = AES_MAX_KEY_SIZE, .cia_setkey = aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt + .cia_encrypt = aesni_encrypt, + .cia_decrypt = aesni_decrypt } } }; diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index abf298c272dc..a4f00128ea55 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -182,7 +182,7 @@ static int xts_encrypt(struct skcipher_request *req) return glue_xts_req_128bit(&camellia_enc_xts, req, XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -192,7 +192,7 @@ static int xts_decrypt(struct skcipher_request *req) return glue_xts_req_128bit(&camellia_dec_xts, req, XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg camellia_algs[] = { diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 0c22d84750a3..f28d282779b8 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -208,7 +208,7 @@ static int xts_encrypt(struct skcipher_request *req) return glue_xts_req_128bit(&camellia_enc_xts, req, XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -218,7 +218,7 @@ static int xts_decrypt(struct skcipher_request *req) return glue_xts_req_128bit(&camellia_dec_xts, req, XTS_TWEAK_CAST(camellia_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg camellia_algs[] = { diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 645f8f16815c..a8a38fffb4a9 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -201,7 +201,7 @@ static int xts_encrypt(struct skcipher_request *req) return glue_xts_req_128bit(&cast6_enc_xts, req, XTS_TWEAK_CAST(__cast6_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -211,7 +211,7 @@ static int xts_decrypt(struct skcipher_request *req) return glue_xts_req_128bit(&cast6_dec_xts, req, XTS_TWEAK_CAST(__cast6_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg cast6_algs[] = { diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c index 968386c21ef4..89830e531350 100644 --- a/arch/x86/crypto/des3_ede_glue.c +++ b/arch/x86/crypto/des3_ede_glue.c @@ -19,8 +19,8 @@ #include <linux/types.h> struct des3_ede_x86_ctx { - u32 enc_expkey[DES3_EDE_EXPKEY_WORDS]; - u32 dec_expkey[DES3_EDE_EXPKEY_WORDS]; + struct des3_ede_ctx enc; + struct des3_ede_ctx dec; }; /* regular block cipher functions */ @@ -34,7 +34,7 @@ asmlinkage void 
des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst, static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { - u32 *enc_ctx = ctx->enc_expkey; + u32 *enc_ctx = ctx->enc.expkey; des3_ede_x86_64_crypt_blk(enc_ctx, dst, src); } @@ -42,7 +42,7 @@ static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { - u32 *dec_ctx = ctx->dec_expkey; + u32 *dec_ctx = ctx->dec.expkey; des3_ede_x86_64_crypt_blk(dec_ctx, dst, src); } @@ -50,7 +50,7 @@ static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { - u32 *enc_ctx = ctx->enc_expkey; + u32 *enc_ctx = ctx->enc.expkey; des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src); } @@ -58,7 +58,7 @@ static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { - u32 *dec_ctx = ctx->dec_expkey; + u32 *dec_ctx = ctx->dec.expkey; des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src); } @@ -122,7 +122,7 @@ static int ecb_encrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - return ecb_crypt(req, ctx->enc_expkey); + return ecb_crypt(req, ctx->enc.expkey); } static int ecb_decrypt(struct skcipher_request *req) @@ -130,7 +130,7 @@ static int ecb_decrypt(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); - return ecb_crypt(req, ctx->dec_expkey); + return ecb_crypt(req, ctx->dec.expkey); } static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx, @@ -348,20 +348,28 @@ static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, u32 i, j, tmp; int err; - /* Generate encryption context using generic implementation. */ - err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen); - if (err < 0) + err = des3_ede_expand_key(&ctx->enc, key, keylen); + if (err == -ENOKEY) { + if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) + err = -EINVAL; + else + err = 0; + } + + if (err) { + memset(ctx, 0, sizeof(*ctx)); return err; + } /* Fix encryption context for this implementation and form decryption * context. 
*/ j = DES3_EDE_EXPKEY_WORDS - 2; for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) { - tmp = ror32(ctx->enc_expkey[i + 1], 4); - ctx->enc_expkey[i + 1] = tmp; + tmp = ror32(ctx->enc.expkey[i + 1], 4); + ctx->enc.expkey[i + 1] = tmp; - ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0]; - ctx->dec_expkey[j + 1] = tmp; + ctx->dec.expkey[j + 0] = ctx->enc.expkey[i + 0]; + ctx->dec.expkey[j + 1] = tmp; } return 0; diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index ac76fe88ac4f..04d72a5a8ce9 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -357,6 +357,5 @@ module_init(ghash_pclmulqdqni_mod_init); module_exit(ghash_pclmulqdqni_mod_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " - "accelerated by PCLMULQDQ-NI"); +MODULE_DESCRIPTION("GHASH hash function, accelerated by PCLMULQDQ-NI"); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 901551445387..d15b99397480 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -14,6 +14,7 @@ #include <crypto/b128ops.h> #include <crypto/gf128mul.h> #include <crypto/internal/skcipher.h> +#include <crypto/scatterwalk.h> #include <crypto/xts.h> #include <asm/crypto/glue_helper.h> @@ -259,17 +260,36 @@ done: int glue_xts_req_128bit(const struct common_glue_ctx *gctx, struct skcipher_request *req, common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx) + void *crypt_ctx, bool decrypt) { + const bool cts = (req->cryptlen % XTS_BLOCK_SIZE); const unsigned int bsize = 128 / 8; + struct skcipher_request subreq; struct skcipher_walk walk; bool fpu_enabled = false; - unsigned int nbytes; + unsigned int nbytes, tail; int err; + if (req->cryptlen < XTS_BLOCK_SIZE) + return -EINVAL; + + if (unlikely(cts)) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + + tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE; + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + crypto_skcipher_get_flags(tfm), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + req->cryptlen - tail, req->iv); + req = &subreq; + } + err = skcipher_walk_virt(&walk, req, false); nbytes = walk.nbytes; - if (!nbytes) + if (err) return err; /* set minimum length to bsize, for tweak_fn */ @@ -287,6 +307,47 @@ int glue_xts_req_128bit(const struct common_glue_ctx *gctx, nbytes = walk.nbytes; } + if (unlikely(cts)) { + u8 *next_tweak, *final_tweak = req->iv; + struct scatterlist *src, *dst; + struct scatterlist s[2], d[2]; + le128 b[2]; + + dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(d, req->dst, req->cryptlen); + + if (decrypt) { + next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE); + gf128mul_x_ble(b, b); + } else { + next_tweak = req->iv; + } + + skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE, + next_tweak); + + err = skcipher_walk_virt(&walk, req, false) ?: + skcipher_walk_done(&walk, + __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); + if (err) + goto out; + + scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0); + memcpy(b + 1, b, tail - XTS_BLOCK_SIZE); + scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE, + tail - XTS_BLOCK_SIZE, 0); + scatterwalk_map_and_copy(b, dst, 0, tail, 1); + + skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE, + final_tweak); + + err = skcipher_walk_virt(&walk, req, false) ?: + 
skcipher_walk_done(&walk, + __glue_xts_req_128bit(gctx, crypt_ctx, &walk)); + } + +out: glue_fpu_end(fpu_enabled); return err; diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S deleted file mode 100644 index 5413fee33481..000000000000 --- a/arch/x86/crypto/morus1280-avx2-asm.S +++ /dev/null @@ -1,619 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AVX2 implementation of MORUS-1280 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) -#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) - -#define STATE0 %ymm0 -#define STATE0_LOW %xmm0 -#define STATE1 %ymm1 -#define STATE2 %ymm2 -#define STATE3 %ymm3 -#define STATE4 %ymm4 -#define KEY %ymm5 -#define MSG %ymm5 -#define MSG_LOW %xmm5 -#define T0 %ymm6 -#define T0_LOW %xmm6 -#define T1 %ymm7 - -.section .rodata.cst32.morus1280_const, "aM", @progbits, 32 -.align 32 -.Lmorus1280_const: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32 -.align 32 -.Lmorus1280_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - -.text - -.macro morus1280_round s0, s1, s2, s3, s4, b, w - vpand \s1, \s2, T0 - vpxor T0, \s0, \s0 - vpxor \s3, \s0, \s0 - vpsllq $\b, \s0, T0 - vpsrlq $(64 - \b), \s0, \s0 - vpxor T0, \s0, \s0 - vpermq $\w, \s3, \s3 -.endm - -/* - * __morus1280_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update: - morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 - vpxor MSG, STATE1, STATE1 - morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 - vpxor MSG, STATE2, STATE2 - morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 - vpxor MSG, STATE3, STATE3 - morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 - vpxor MSG, STATE4, STATE4 - morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 - ret -ENDPROC(__morus1280_update) - -/* - * __morus1280_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update_zero: - morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1 - morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2 - morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3 - morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2, 7, MASK2 - morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3, 4, MASK1 - ret -ENDPROC(__morus1280_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - vpxor MSG, MSG, MSG - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, 
%r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG_LOW - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pshufd $MASK2, MSG_LOW, MSG_LOW - pinsrq $0, (%r8), MSG_LOW - -.Lld_partial_8: - mov %rcx, %r8 - and $0x10, %r8 - jz .Lld_partial_16 - - vpermq $MASK2, MSG, MSG - movdqu (%rsi), MSG_LOW - -.Lld_partial_16: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - cmp $16, %r8 - jl .Lst_partial_16 - - movdqu T0_LOW, (%r9) - vpermq $MASK2, T0, T0 - - sub $16, %r8 - add $16, %r9 - -.Lst_partial_16: - movq T0_LOW, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - pextrq $1, T0_LOW, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus1280_avx2_init(void *state, const void *key, - * const void *iv); - */ -ENTRY(crypto_morus1280_avx2_init) - FRAME_BEGIN - - /* load IV: */ - vpxor STATE0, STATE0, STATE0 - movdqu (%rdx), STATE0_LOW - /* load key: */ - vmovdqu (%rsi), KEY - vmovdqa KEY, STATE1 - /* load all ones: */ - vpcmpeqd STATE2, STATE2, STATE2 - /* load all zeros: */ - vpxor STATE3, STATE3, STATE3 - /* load the constant: */ - vmovdqa .Lmorus1280_const, STATE4 - - /* update 16 times with zero: */ - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - - /* xor-in the key again after updates: */ - vpxor KEY, STATE1, STATE1 - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_init) - -/* - * void crypto_morus1280_avx2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_ad) - FRAME_BEGIN - - cmp $32, %rdx - jb .Lad_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - and $0x1F, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - vmovdqa (%rsi), MSG - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - vmovdqu (%rsi), MSG - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge 
.Lad_u_loop - -.Lad_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_ad) - -/* - * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_enc) - FRAME_BEGIN - - cmp $32, %rcx - jb .Lenc_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0x1F, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - vmovdqa (%rsi), MSG - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - vmovdqa T0, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - vmovdqu (%rsi), MSG - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - vmovdqu T0, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_enc) - -/* - * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* encrypt message: */ - call __load_partial - - vmovdqa MSG, T0 - vpxor STATE0, T0, T0 - vpermq $MASK3, STATE1, T1 - vpxor T1, T0, T0 - vpand STATE2, STATE3, T1 - vpxor T1, T0, T0 - - call __store_partial - - call __morus1280_update - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_enc_tail) - -/* - * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_dec) - FRAME_BEGIN - - cmp $32, %rcx - jb .Ldec_out - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0x1F, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - vmovdqa (%rsi), MSG - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqa MSG, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - vmovdqu (%rsi), MSG - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqu MSG, (%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - 
cmp $32, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_dec) - -/* - * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_avx2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* decrypt message: */ - call __load_partial - - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqa MSG, T0 - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0_LOW - vpbroadcastb T0_LOW, T0 - vmovdqa .Lmorus1280_counter, T1 - vpcmpgtb T1, T0, T0 - vpand T0, MSG, MSG - - call __morus1280_update - - /* store the state: */ - vmovdqu STATE0, (0 * 32)(%rdi) - vmovdqu STATE1, (1 * 32)(%rdi) - vmovdqu STATE2, (2 * 32)(%rdi) - vmovdqu STATE3, (3 * 32)(%rdi) - vmovdqu STATE4, (4 * 32)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_dec_tail) - -/* - * void crypto_morus1280_avx2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus1280_avx2_final) - FRAME_BEGIN - - /* load the state: */ - vmovdqu (0 * 32)(%rdi), STATE0 - vmovdqu (1 * 32)(%rdi), STATE1 - vmovdqu (2 * 32)(%rdi), STATE2 - vmovdqu (3 * 32)(%rdi), STATE3 - vmovdqu (4 * 32)(%rdi), STATE4 - - /* xor state[0] into state[4]: */ - vpxor STATE0, STATE4, STATE4 - - /* prepare length block: */ - vpxor MSG, MSG, MSG - vpinsrq $0, %rdx, MSG_LOW, MSG_LOW - vpinsrq $1, %rcx, MSG_LOW, MSG_LOW - vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */ - - /* update state: */ - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - - /* xor tag: */ - vmovdqu (%rsi), MSG - - vpxor STATE0, MSG, MSG - vpermq $MASK3, STATE1, T0 - vpxor T0, MSG, MSG - vpand STATE2, STATE3, T0 - vpxor T0, MSG, MSG - vmovdqu MSG, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_avx2_final) diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c deleted file mode 100644 index 2d000d66ba4c..000000000000 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Glue for AVX2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
- */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/morus1280_glue.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS1280_DECLARE_ALG(avx2, "morus1280-avx2", 400); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus1280_avx2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_AVX2) || - !boot_cpu_has(X86_FEATURE_OSXSAVE) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus1280_avx2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus1280_avx2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus1280_avx2_alg, 1, &simd_alg); -} - -module_init(crypto_morus1280_avx2_module_init); -module_exit(crypto_morus1280_avx2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation"); -MODULE_ALIAS_CRYPTO("morus1280"); -MODULE_ALIAS_CRYPTO("morus1280-avx2"); diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S deleted file mode 100644 index 0eece772866b..000000000000 --- a/arch/x86/crypto/morus1280-sse2-asm.S +++ /dev/null @@ -1,893 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * SSE2 implementation of MORUS-1280 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
- */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) - -#define STATE0_LO %xmm0 -#define STATE0_HI %xmm1 -#define STATE1_LO %xmm2 -#define STATE1_HI %xmm3 -#define STATE2_LO %xmm4 -#define STATE2_HI %xmm5 -#define STATE3_LO %xmm6 -#define STATE3_HI %xmm7 -#define STATE4_LO %xmm8 -#define STATE4_HI %xmm9 -#define KEY_LO %xmm10 -#define KEY_HI %xmm11 -#define MSG_LO %xmm10 -#define MSG_HI %xmm11 -#define T0_LO %xmm12 -#define T0_HI %xmm13 -#define T1_LO %xmm14 -#define T1_HI %xmm15 - -.section .rodata.cst16.morus640_const, "aM", @progbits, 16 -.align 16 -.Lmorus640_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Lmorus640_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 -.align 16 -.Lmorus640_counter_0: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f -.Lmorus640_counter_1: - .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 - .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f - -.text - -.macro rol1 hi, lo - /* - * HI_1 | HI_0 || LO_1 | LO_0 - * ==> - * HI_0 | HI_1 || LO_1 | LO_0 - * ==> - * HI_0 | LO_1 || LO_0 | HI_1 - */ - pshufd $MASK2, \hi, \hi - movdqa \hi, T0_LO - punpcklqdq \lo, T0_LO - punpckhqdq \hi, \lo - movdqa \lo, \hi - movdqa T0_LO, \lo -.endm - -.macro rol2 hi, lo - movdqa \lo, T0_LO - movdqa \hi, \lo - movdqa T0_LO, \hi -.endm - -.macro rol3 hi, lo - /* - * HI_1 | HI_0 || LO_1 | LO_0 - * ==> - * HI_0 | HI_1 || LO_1 | LO_0 - * ==> - * LO_0 | HI_1 || HI_0 | LO_1 - */ - pshufd $MASK2, \hi, \hi - movdqa \lo, T0_LO - punpckhqdq \hi, T0_LO - punpcklqdq \lo, \hi - movdqa T0_LO, \lo -.endm - -.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w - movdqa \s1_l, T0_LO - pand \s2_l, T0_LO - pxor T0_LO, \s0_l - - movdqa \s1_h, T0_LO - pand \s2_h, T0_LO - pxor T0_LO, \s0_h - - pxor \s3_l, \s0_l - pxor \s3_h, \s0_h - - movdqa \s0_l, T0_LO - psllq $\b, T0_LO - psrlq $(64 - \b), \s0_l - pxor T0_LO, \s0_l - - movdqa \s0_h, T0_LO - psllq $\b, T0_LO - psrlq $(64 - \b), \s0_h - pxor T0_LO, \s0_h - - \w \s3_h, \s3_l -.endm - -/* - * __morus1280_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update: - morus1280_round \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - 13, rol1 - pxor MSG_LO, STATE1_LO - pxor MSG_HI, STATE1_HI - morus1280_round \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - 46, rol2 - pxor MSG_LO, STATE2_LO - pxor MSG_HI, STATE2_HI - morus1280_round \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - 38, rol3 - pxor MSG_LO, STATE3_LO - pxor MSG_HI, STATE3_HI - morus1280_round \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - 7, rol2 - pxor MSG_LO, STATE4_LO - pxor MSG_HI, STATE4_HI - morus1280_round \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ 
- STATE3_LO, STATE3_HI, \ - 4, rol1 - ret -ENDPROC(__morus1280_update) - -/* - * __morus1280_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus1280_update_zero: - morus1280_round \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - 13, rol1 - morus1280_round \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - 46, rol2 - morus1280_round \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - 38, rol3 - morus1280_round \ - STATE3_LO, STATE3_HI, \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - 7, rol2 - morus1280_round \ - STATE4_LO, STATE4_HI, \ - STATE0_LO, STATE0_HI, \ - STATE1_LO, STATE1_HI, \ - STATE2_LO, STATE2_HI, \ - STATE3_LO, STATE3_HI, \ - 4, rol1 - ret -ENDPROC(__morus1280_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG_LO, MSG_LO - pxor MSG_HI, MSG_HI - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG_LO - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pslldq $8, MSG_LO - movq (%r8), T0_LO - pxor T0_LO, MSG_LO - -.Lld_partial_8: - mov %rcx, %r8 - and $0x10, %r8 - jz .Lld_partial_16 - - movdqa MSG_LO, MSG_HI - movdqu (%rsi), MSG_LO - -.Lld_partial_16: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - cmp $16, %r8 - jl .Lst_partial_16 - - movdqu T0_LO, (%r9) - movdqa T0_HI, T0_LO - - sub $16, %r8 - add $16, %r9 - -.Lst_partial_16: - movq T0_LO, %r10 - - cmp $8, %r8 - jl .Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0_LO - movq T0_LO, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus1280_sse2_init(void *state, const void *key, - * const void *iv); - */ -ENTRY(crypto_morus1280_sse2_init) - FRAME_BEGIN - - /* load IV: */ - pxor STATE0_HI, STATE0_HI - movdqu (%rdx), STATE0_LO - /* load key: */ - movdqu 0(%rsi), KEY_LO - movdqu 16(%rsi), KEY_HI - movdqa KEY_LO, STATE1_LO - movdqa KEY_HI, STATE1_HI - /* load all ones: */ - pcmpeqd STATE2_LO, STATE2_LO - pcmpeqd STATE2_HI, STATE2_HI - /* load all zeros: */ - pxor STATE3_LO, STATE3_LO - pxor STATE3_HI, STATE3_HI - /* load the constant: */ - movdqa 
.Lmorus640_const_0, STATE4_LO - movdqa .Lmorus640_const_1, STATE4_HI - - /* update 16 times with zero: */ - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - call __morus1280_update_zero - - /* xor-in the key again after updates: */ - pxor KEY_LO, STATE1_LO - pxor KEY_HI, STATE1_HI - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_init) - -/* - * void crypto_morus1280_sse2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_ad) - FRAME_BEGIN - - cmp $32, %rdx - jb .Lad_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - call __morus1280_update - sub $32, %rdx - add $32, %rsi - cmp $32, %rdx - jge .Lad_u_loop - -.Lad_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_ad) - -/* - * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_enc) - FRAME_BEGIN - - cmp $32, %rcx - jb .Lenc_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand 
STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - movdqa T0_LO, 0(%rdx) - movdqa T0_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - movdqu T0_LO, 0(%rdx) - movdqu T0_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_enc) - -/* - * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* encrypt message: */ - call __load_partial - - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - pxor STATE0_LO, T0_LO - pxor STATE0_HI, T0_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, T0_LO - pxor T1_HI, T0_HI - - call __store_partial - - call __morus1280_update - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_enc_tail) - -/* - * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_dec) - FRAME_BEGIN - - cmp $32, %rcx - jb .Ldec_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - movdqa 0(%rsi), MSG_LO - movdqa 16(%rsi), MSG_HI - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - 
pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa MSG_LO, 0(%rdx) - movdqa MSG_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqu MSG_LO, 0(%rdx) - movdqu MSG_HI, 16(%rdx) - - call __morus1280_update - sub $32, %rcx - add $32, %rsi - add $32, %rdx - cmp $32, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_dec) - -/* - * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus1280_sse2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* decrypt message: */ - call __load_partial - - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T1_LO - movdqa STATE1_HI, T1_HI - rol3 T1_HI, T1_LO - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa STATE2_LO, T1_LO - movdqa STATE2_HI, T1_HI - pand STATE3_LO, T1_LO - pand STATE3_HI, T1_HI - pxor T1_LO, MSG_LO - pxor T1_HI, MSG_HI - movdqa MSG_LO, T0_LO - movdqa MSG_HI, T0_HI - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - punpcklbw T0_LO, T0_LO - movdqa T0_LO, T0_HI - movdqa .Lmorus640_counter_0, T1_LO - movdqa .Lmorus640_counter_1, T1_HI - pcmpgtb T1_LO, T0_LO - pcmpgtb T1_HI, T0_HI - pand T0_LO, MSG_LO - pand T0_HI, MSG_HI - - call __morus1280_update - - /* store the state: */ - movdqu STATE0_LO, (0 * 16)(%rdi) - movdqu STATE0_HI, (1 * 16)(%rdi) - movdqu STATE1_LO, (2 * 16)(%rdi) - movdqu STATE1_HI, (3 * 16)(%rdi) - movdqu STATE2_LO, (4 * 16)(%rdi) - movdqu STATE2_HI, (5 * 16)(%rdi) - movdqu STATE3_LO, (6 * 16)(%rdi) - movdqu STATE3_HI, (7 * 16)(%rdi) - movdqu STATE4_LO, (8 * 16)(%rdi) - movdqu STATE4_HI, (9 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_dec_tail) - -/* - * void crypto_morus1280_sse2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus1280_sse2_final) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0_LO - movdqu (1 * 16)(%rdi), STATE0_HI - movdqu (2 * 16)(%rdi), STATE1_LO - movdqu (3 * 16)(%rdi), STATE1_HI - movdqu (4 * 16)(%rdi), STATE2_LO - movdqu (5 * 16)(%rdi), STATE2_HI - movdqu (6 * 
16)(%rdi), STATE3_LO - movdqu (7 * 16)(%rdi), STATE3_HI - movdqu (8 * 16)(%rdi), STATE4_LO - movdqu (9 * 16)(%rdi), STATE4_HI - - /* xor state[0] into state[4]: */ - pxor STATE0_LO, STATE4_LO - pxor STATE0_HI, STATE4_HI - - /* prepare length block: */ - movq %rdx, MSG_LO - movq %rcx, T0_LO - pslldq $8, T0_LO - pxor T0_LO, MSG_LO - psllq $3, MSG_LO /* multiply by 8 (to get bit count) */ - pxor MSG_HI, MSG_HI - - /* update state: */ - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - call __morus1280_update - - /* xor tag: */ - movdqu 0(%rsi), MSG_LO - movdqu 16(%rsi), MSG_HI - - pxor STATE0_LO, MSG_LO - pxor STATE0_HI, MSG_HI - movdqa STATE1_LO, T0_LO - movdqa STATE1_HI, T0_HI - rol3 T0_HI, T0_LO - pxor T0_LO, MSG_LO - pxor T0_HI, MSG_HI - movdqa STATE2_LO, T0_LO - movdqa STATE2_HI, T0_HI - pand STATE3_LO, T0_LO - pand STATE3_HI, T0_HI - pxor T0_LO, MSG_LO - pxor T0_HI, MSG_HI - - movdqu MSG_LO, 0(%rsi) - movdqu MSG_HI, 16(%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus1280_sse2_final) diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c deleted file mode 100644 index aada9d774293..000000000000 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Glue for SSE2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/morus1280_glue.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS1280_DECLARE_ALG(sse2, "morus1280-sse2", 350); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus1280_sse2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus1280_sse2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus1280_sse2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus1280_sse2_alg, 1, &simd_alg); -} - -module_init(crypto_morus1280_sse2_module_init); -module_exit(crypto_morus1280_sse2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation"); -MODULE_ALIAS_CRYPTO("morus1280"); -MODULE_ALIAS_CRYPTO("morus1280-sse2"); diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c deleted file mode 100644 index 
ffbde8b22838..000000000000 --- a/arch/x86/crypto/morus1280_glue.c +++ /dev/null @@ -1,205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-1280 Authenticated-Encryption Algorithm - * Common x86 SIMD glue skeleton - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/skcipher.h> -#include <crypto/morus1280_glue.h> -#include <crypto/scatterwalk.h> -#include <linux/err.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/scatterlist.h> -#include <asm/fpu/api.h> - -struct morus1280_state { - struct morus1280_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus1280_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, const void *src, void *dst, - unsigned int length); - void (*crypt_tail)(void *state, const void *src, void *dst, - unsigned int length); -}; - -static void crypto_morus1280_glue_process_ad( - struct morus1280_state *state, - const struct morus1280_glue_ops *ops, - struct scatterlist *sg_src, unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus1280_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS1280_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS1280_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); - pos = 0; - left -= fill; - src += fill; - } - - ops->ad(state, src, left); - src += left & ~(MORUS1280_BLOCK_SIZE - 1); - left &= MORUS1280_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS1280_BLOCK_SIZE - pos); - ops->ad(state, buf.bytes, MORUS1280_BLOCK_SIZE); - } -} - -static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state, - struct morus1280_ops ops, - struct skcipher_walk *walk) -{ - while (walk->nbytes >= MORUS1280_BLOCK_SIZE) { - ops.crypt_blocks(state, walk->src.virt.addr, - walk->dst.virt.addr, - round_down(walk->nbytes, - MORUS1280_BLOCK_SIZE)); - skcipher_walk_done(walk, walk->nbytes % MORUS1280_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr, - walk->nbytes); - skcipher_walk_done(walk, 0); - } -} - -int crypto_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen == MORUS1280_BLOCK_SIZE) { - memcpy(ctx->key.bytes, key, MORUS1280_BLOCK_SIZE); - } else if (keylen == MORUS1280_BLOCK_SIZE / 2) { - memcpy(ctx->key.bytes, key, keylen); - memcpy(ctx->key.bytes + keylen, key, keylen); - } else { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setkey); - -int crypto_morus1280_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 
0 : -EINVAL; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_setauthsize); - -static void crypto_morus1280_glue_crypt(struct aead_request *req, - struct morus1280_ops ops, - unsigned int cryptlen, - struct morus1280_block *tag_xor) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_state state; - struct skcipher_walk walk; - - ops.skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - ctx->ops->init(&state, &ctx->key, req->iv); - crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); - crypto_morus1280_glue_process_crypt(&state, ops, &walk); - ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -int crypto_morus1280_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = ctx->ops->enc, - .crypt_tail = ctx->ops->enc_tail, - }; - - struct morus1280_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_encrypt); - -int crypto_morus1280_glue_decrypt(struct aead_request *req) -{ - static const u8 zeros[MORUS1280_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus1280_ctx *ctx = crypto_aead_ctx(tfm); - struct morus1280_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = ctx->ops->dec, - .crypt_tail = ctx->ops->dec_tail, - }; - - struct morus1280_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus1280_glue_crypt(req, OPS, cryptlen, &tag); - - return crypto_memneq(tag.bytes, zeros, authsize) ? -EBADMSG : 0; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_decrypt); - -void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, - const struct morus1280_glue_ops *ops) -{ - struct morus1280_ctx *ctx = crypto_aead_ctx(aead); - ctx->ops = ops; -} -EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S deleted file mode 100644 index a60891101bbd..000000000000 --- a/arch/x86/crypto/morus640-sse2-asm.S +++ /dev/null @@ -1,612 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * SSE2 implementation of MORUS-640 - * - * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
- */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -#define SHUFFLE_MASK(i0, i1, i2, i3) \ - (i0 | (i1 << 2) | (i2 << 4) | (i3 << 6)) - -#define MASK1 SHUFFLE_MASK(3, 0, 1, 2) -#define MASK2 SHUFFLE_MASK(2, 3, 0, 1) -#define MASK3 SHUFFLE_MASK(1, 2, 3, 0) - -#define STATE0 %xmm0 -#define STATE1 %xmm1 -#define STATE2 %xmm2 -#define STATE3 %xmm3 -#define STATE4 %xmm4 -#define KEY %xmm5 -#define MSG %xmm5 -#define T0 %xmm6 -#define T1 %xmm7 - -.section .rodata.cst16.morus640_const, "aM", @progbits, 32 -.align 16 -.Lmorus640_const_0: - .byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d - .byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62 -.Lmorus640_const_1: - .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 - .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd - -.section .rodata.cst16.morus640_counter, "aM", @progbits, 16 -.align 16 -.Lmorus640_counter: - .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 - .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f - -.text - -.macro morus640_round s0, s1, s2, s3, s4, b, w - movdqa \s1, T0 - pand \s2, T0 - pxor T0, \s0 - pxor \s3, \s0 - movdqa \s0, T0 - pslld $\b, T0 - psrld $(32 - \b), \s0 - pxor T0, \s0 - pshufd $\w, \s3, \s3 -.endm - -/* - * __morus640_update: internal ABI - * input: - * STATE[0-4] - input state - * MSG - message block - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus640_update: - morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 - pxor MSG, STATE1 - morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 - pxor MSG, STATE2 - morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 - pxor MSG, STATE3 - morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 - pxor MSG, STATE4 - morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 - ret -ENDPROC(__morus640_update) - - -/* - * __morus640_update_zero: internal ABI - * input: - * STATE[0-4] - input state - * output: - * STATE[0-4] - output state - * changed: - * T0 - */ -__morus640_update_zero: - morus640_round STATE0, STATE1, STATE2, STATE3, STATE4, 5, MASK1 - morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2 - morus640_round STATE2, STATE3, STATE4, STATE0, STATE1, 7, MASK3 - morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2 - morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1 - ret -ENDPROC(__morus640_update_zero) - -/* - * __load_partial: internal ABI - * input: - * %rsi - src - * %rcx - bytes - * output: - * MSG - message block - * changed: - * T0 - * %r8 - * %r9 - */ -__load_partial: - xor %r9d, %r9d - pxor MSG, MSG - - mov %rcx, %r8 - and $0x1, %r8 - jz .Lld_partial_1 - - mov %rcx, %r8 - and $0x1E, %r8 - add %rsi, %r8 - mov (%r8), %r9b - -.Lld_partial_1: - mov %rcx, %r8 - and $0x2, %r8 - jz .Lld_partial_2 - - mov %rcx, %r8 - and $0x1C, %r8 - add %rsi, %r8 - shl $16, %r9 - mov (%r8), %r9w - -.Lld_partial_2: - mov %rcx, %r8 - and $0x4, %r8 - jz .Lld_partial_4 - - mov %rcx, %r8 - and $0x18, %r8 - add %rsi, %r8 - shl $32, %r9 - mov (%r8), %r8d - xor %r8, %r9 - -.Lld_partial_4: - movq %r9, MSG - - mov %rcx, %r8 - and $0x8, %r8 - jz .Lld_partial_8 - - mov %rcx, %r8 - and $0x10, %r8 - add %rsi, %r8 - pslldq $8, MSG - movq (%r8), T0 - pxor T0, MSG - -.Lld_partial_8: - ret -ENDPROC(__load_partial) - -/* - * __store_partial: internal ABI - * input: - * %rdx - dst - * %rcx - bytes - * output: - * T0 - message block - * changed: - * %r8 - * %r9 - * %r10 - */ -__store_partial: - mov %rcx, %r8 - mov %rdx, %r9 - - movq T0, %r10 - - cmp $8, %r8 - jl 
.Lst_partial_8 - - mov %r10, (%r9) - psrldq $8, T0 - movq T0, %r10 - - sub $8, %r8 - add $8, %r9 - -.Lst_partial_8: - cmp $4, %r8 - jl .Lst_partial_4 - - mov %r10d, (%r9) - shr $32, %r10 - - sub $4, %r8 - add $4, %r9 - -.Lst_partial_4: - cmp $2, %r8 - jl .Lst_partial_2 - - mov %r10w, (%r9) - shr $16, %r10 - - sub $2, %r8 - add $2, %r9 - -.Lst_partial_2: - cmp $1, %r8 - jl .Lst_partial_1 - - mov %r10b, (%r9) - -.Lst_partial_1: - ret -ENDPROC(__store_partial) - -/* - * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv); - */ -ENTRY(crypto_morus640_sse2_init) - FRAME_BEGIN - - /* load IV: */ - movdqu (%rdx), STATE0 - /* load key: */ - movdqu (%rsi), KEY - movdqa KEY, STATE1 - /* load all ones: */ - pcmpeqd STATE2, STATE2 - /* load the constants: */ - movdqa .Lmorus640_const_0, STATE3 - movdqa .Lmorus640_const_1, STATE4 - - /* update 16 times with zero: */ - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - call __morus640_update_zero - - /* xor-in the key again after updates: */ - pxor KEY, STATE1 - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_init) - -/* - * void crypto_morus640_sse2_ad(void *state, const void *data, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_ad) - FRAME_BEGIN - - cmp $16, %rdx - jb .Lad_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - and $0xF, %r8 - jnz .Lad_u_loop - -.align 4 -.Lad_a_loop: - movdqa (%rsi), MSG - call __morus640_update - sub $16, %rdx - add $16, %rsi - cmp $16, %rdx - jge .Lad_a_loop - - jmp .Lad_cont -.align 4 -.Lad_u_loop: - movdqu (%rsi), MSG - call __morus640_update - sub $16, %rdx - add $16, %rsi - cmp $16, %rdx - jge .Lad_u_loop - -.Lad_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Lad_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_ad) - -/* - * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_enc) - FRAME_BEGIN - - cmp $16, %rcx - jb .Lenc_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Lenc_u_loop - -.align 4 -.Lenc_a_loop: - movdqa (%rsi), MSG - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - movdqa T0, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Lenc_a_loop - - jmp .Lenc_cont -.align 4 -.Lenc_u_loop: - movdqu (%rsi), MSG - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - 
movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - movdqu T0, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Lenc_u_loop - -.Lenc_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Lenc_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_enc) - -/* - * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_enc_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* encrypt message: */ - call __load_partial - - movdqa MSG, T0 - pxor STATE0, T0 - pshufd $MASK3, STATE1, T1 - pxor T1, T0 - movdqa STATE2, T1 - pand STATE3, T1 - pxor T1, T0 - - call __store_partial - - call __morus640_update - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_enc_tail) - -/* - * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_dec) - FRAME_BEGIN - - cmp $16, %rcx - jb .Ldec_out - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - mov %rsi, %r8 - or %rdx, %r8 - and $0xF, %r8 - jnz .Ldec_u_loop - -.align 4 -.Ldec_a_loop: - movdqa (%rsi), MSG - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqa MSG, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Ldec_a_loop - - jmp .Ldec_cont -.align 4 -.Ldec_u_loop: - movdqu (%rsi), MSG - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqu MSG, (%rdx) - - call __morus640_update - sub $16, %rcx - add $16, %rsi - add $16, %rdx - cmp $16, %rcx - jge .Ldec_u_loop - -.Ldec_cont: - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 16)(%rdi) - -.Ldec_out: - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_dec) - -/* - * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst, - * unsigned int length); - */ -ENTRY(crypto_morus640_sse2_dec_tail) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* decrypt message: */ - call __load_partial - - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - movdqa MSG, T0 - - call __store_partial - - /* mask with byte count: */ - movq %rcx, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - punpcklbw T0, T0 - movdqa .Lmorus640_counter, T1 - pcmpgtb T1, T0 - pand T0, MSG - - call __morus640_update - - /* store the state: */ - movdqu STATE0, (0 * 16)(%rdi) - movdqu STATE1, (1 * 16)(%rdi) - movdqu STATE2, (2 * 16)(%rdi) - movdqu STATE3, (3 * 16)(%rdi) - movdqu STATE4, (4 * 
16)(%rdi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_dec_tail) - -/* - * void crypto_morus640_sse2_final(void *state, void *tag_xor, - * u64 assoclen, u64 cryptlen); - */ -ENTRY(crypto_morus640_sse2_final) - FRAME_BEGIN - - /* load the state: */ - movdqu (0 * 16)(%rdi), STATE0 - movdqu (1 * 16)(%rdi), STATE1 - movdqu (2 * 16)(%rdi), STATE2 - movdqu (3 * 16)(%rdi), STATE3 - movdqu (4 * 16)(%rdi), STATE4 - - /* xor state[0] into state[4]: */ - pxor STATE0, STATE4 - - /* prepare length block: */ - movq %rdx, MSG - movq %rcx, T0 - pslldq $8, T0 - pxor T0, MSG - psllq $3, MSG /* multiply by 8 (to get bit count) */ - - /* update state: */ - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - call __morus640_update - - /* xor tag: */ - movdqu (%rsi), MSG - - pxor STATE0, MSG - pshufd $MASK3, STATE1, T0 - pxor T0, MSG - movdqa STATE2, T0 - pand STATE3, T0 - pxor T0, MSG - - movdqu MSG, (%rsi) - - FRAME_END - ret -ENDPROC(crypto_morus640_sse2_final) diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c deleted file mode 100644 index 8ef68134aef4..000000000000 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * Glue for SSE2 implementation - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> -#include <crypto/morus640_glue.h> -#include <linux/module.h> -#include <asm/fpu/api.h> -#include <asm/cpu_device_id.h> - -asmlinkage void crypto_morus640_sse2_init(void *state, const void *key, - const void *iv); -asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data, - unsigned int length); - -asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src, - void *dst, unsigned int length); -asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, - void *dst, unsigned int length); - -asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, - u64 assoclen, u64 cryptlen); - -MORUS640_DECLARE_ALG(sse2, "morus640-sse2", 400); - -static struct simd_aead_alg *simd_alg; - -static int __init crypto_morus640_sse2_module_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_XMM2) || - !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) - return -ENODEV; - - return simd_register_aeads_compat(&crypto_morus640_sse2_alg, 1, - &simd_alg); -} - -static void __exit crypto_morus640_sse2_module_exit(void) -{ - simd_unregister_aeads(&crypto_morus640_sse2_alg, 1, &simd_alg); -} - -module_init(crypto_morus640_sse2_module_init); -module_exit(crypto_morus640_sse2_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation"); -MODULE_ALIAS_CRYPTO("morus640"); -MODULE_ALIAS_CRYPTO("morus640-sse2"); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c deleted file mode 100644 index d8b5fd6cef29..000000000000 --- a/arch/x86/crypto/morus640_glue.c +++ 
/dev/null @@ -1,200 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * The MORUS-640 Authenticated-Encryption Algorithm - * Common x86 SIMD glue skeleton - * - * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com> - * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - */ - -#include <crypto/internal/aead.h> -#include <crypto/internal/skcipher.h> -#include <crypto/morus640_glue.h> -#include <crypto/scatterwalk.h> -#include <linux/err.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/scatterlist.h> -#include <asm/fpu/api.h> - -struct morus640_state { - struct morus640_block s[MORUS_STATE_BLOCKS]; -}; - -struct morus640_ops { - int (*skcipher_walk_init)(struct skcipher_walk *walk, - struct aead_request *req, bool atomic); - - void (*crypt_blocks)(void *state, const void *src, void *dst, - unsigned int length); - void (*crypt_tail)(void *state, const void *src, void *dst, - unsigned int length); -}; - -static void crypto_morus640_glue_process_ad( - struct morus640_state *state, - const struct morus640_glue_ops *ops, - struct scatterlist *sg_src, unsigned int assoclen) -{ - struct scatter_walk walk; - struct morus640_block buf; - unsigned int pos = 0; - - scatterwalk_start(&walk, sg_src); - while (assoclen != 0) { - unsigned int size = scatterwalk_clamp(&walk, assoclen); - unsigned int left = size; - void *mapped = scatterwalk_map(&walk); - const u8 *src = (const u8 *)mapped; - - if (pos + size >= MORUS640_BLOCK_SIZE) { - if (pos > 0) { - unsigned int fill = MORUS640_BLOCK_SIZE - pos; - memcpy(buf.bytes + pos, src, fill); - ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); - pos = 0; - left -= fill; - src += fill; - } - - ops->ad(state, src, left); - src += left & ~(MORUS640_BLOCK_SIZE - 1); - left &= MORUS640_BLOCK_SIZE - 1; - } - - memcpy(buf.bytes + pos, src, left); - - pos += left; - assoclen -= size; - scatterwalk_unmap(mapped); - scatterwalk_advance(&walk, size); - scatterwalk_done(&walk, 0, assoclen); - } - - if (pos > 0) { - memset(buf.bytes + pos, 0, MORUS640_BLOCK_SIZE - pos); - ops->ad(state, buf.bytes, MORUS640_BLOCK_SIZE); - } -} - -static void crypto_morus640_glue_process_crypt(struct morus640_state *state, - struct morus640_ops ops, - struct skcipher_walk *walk) -{ - while (walk->nbytes >= MORUS640_BLOCK_SIZE) { - ops.crypt_blocks(state, walk->src.virt.addr, - walk->dst.virt.addr, - round_down(walk->nbytes, MORUS640_BLOCK_SIZE)); - skcipher_walk_done(walk, walk->nbytes % MORUS640_BLOCK_SIZE); - } - - if (walk->nbytes) { - ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr, - walk->nbytes); - skcipher_walk_done(walk, 0); - } -} - -int crypto_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - - if (keylen != MORUS640_BLOCK_SIZE) { - crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - - memcpy(ctx->key.bytes, key, MORUS640_BLOCK_SIZE); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_setkey); - -int crypto_morus640_glue_setauthsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - return (authsize <= MORUS_MAX_AUTH_SIZE) ? 
0 : -EINVAL; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_setauthsize); - -static void crypto_morus640_glue_crypt(struct aead_request *req, - struct morus640_ops ops, - unsigned int cryptlen, - struct morus640_block *tag_xor) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_state state; - struct skcipher_walk walk; - - ops.skcipher_walk_init(&walk, req, true); - - kernel_fpu_begin(); - - ctx->ops->init(&state, &ctx->key, req->iv); - crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen); - crypto_morus640_glue_process_crypt(&state, ops, &walk); - ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen); - - kernel_fpu_end(); -} - -int crypto_morus640_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_encrypt, - .crypt_blocks = ctx->ops->enc, - .crypt_tail = ctx->ops->enc_tail, - }; - - struct morus640_block tag = {}; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen; - - crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); - - scatterwalk_map_and_copy(tag.bytes, req->dst, - req->assoclen + cryptlen, authsize, 1); - return 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_encrypt); - -int crypto_morus640_glue_decrypt(struct aead_request *req) -{ - static const u8 zeros[MORUS640_BLOCK_SIZE] = {}; - - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct morus640_ctx *ctx = crypto_aead_ctx(tfm); - struct morus640_ops OPS = { - .skcipher_walk_init = skcipher_walk_aead_decrypt, - .crypt_blocks = ctx->ops->dec, - .crypt_tail = ctx->ops->dec_tail, - }; - - struct morus640_block tag; - unsigned int authsize = crypto_aead_authsize(tfm); - unsigned int cryptlen = req->cryptlen - authsize; - - scatterwalk_map_and_copy(tag.bytes, req->src, - req->assoclen + cryptlen, authsize, 0); - - crypto_morus640_glue_crypt(req, OPS, cryptlen, &tag); - - return crypto_memneq(tag.bytes, zeros, authsize) ? 
-EBADMSG : 0; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_decrypt); - -void crypto_morus640_glue_init_ops(struct crypto_aead *aead, - const struct morus640_glue_ops *ops) -{ - struct morus640_ctx *ctx = crypto_aead_ctx(aead); - ctx->ops = ops; -} -EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); -MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index b871728e0b2f..13fd8d3d2da0 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -167,7 +167,7 @@ static int xts_encrypt(struct skcipher_request *req) return glue_xts_req_128bit(&serpent_enc_xts, req, XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -177,7 +177,7 @@ static int xts_decrypt(struct skcipher_request *req) return glue_xts_req_128bit(&serpent_dec_xts, req, XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg serpent_algs[] = { diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 4a9a9f2ee1d8..7d3dca38a5a2 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -207,7 +207,7 @@ static int xts_encrypt(struct skcipher_request *req) return glue_xts_req_128bit(&serpent_enc_xts, req, XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -217,7 +217,7 @@ static int xts_decrypt(struct skcipher_request *req) return glue_xts_req_128bit(&serpent_dec_xts, req, XTS_TWEAK_CAST(__serpent_encrypt), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg serpent_algs[] = { diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 73867da3cbee..f9aff31fe59e 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -45,8 +45,8 @@ asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 rounds); typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds); -static int sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha256_transform_fn *sha256_xform) +static int _sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha256_transform_fn *sha256_xform) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -84,7 +84,7 @@ static int sha256_finup(struct shash_desc *desc, const u8 *data, static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha256_update(desc, data, len, sha256_transform_ssse3); + return _sha256_update(desc, data, len, sha256_transform_ssse3); } static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, @@ -151,7 +151,7 @@ asmlinkage void sha256_transform_avx(u32 *digest, const char *data, static int sha256_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha256_update(desc, data, len, sha256_transform_avx); + return _sha256_update(desc, data, len, sha256_transform_avx); } static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, @@ -233,7 +233,7 @@ asmlinkage void sha256_transform_rorx(u32 *digest, const 
char *data, static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha256_update(desc, data, len, sha256_transform_rorx); + return _sha256_update(desc, data, len, sha256_transform_rorx); } static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, @@ -313,7 +313,7 @@ asmlinkage void sha256_ni_transform(u32 *digest, const char *data, static int sha256_ni_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha256_update(desc, data, len, sha256_ni_transform); + return _sha256_update(desc, data, len, sha256_ni_transform); } static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 0dbf8e8b09d7..d561c821788b 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -210,7 +210,7 @@ static int xts_encrypt(struct skcipher_request *req) return glue_xts_req_128bit(&twofish_enc_xts, req, XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, false); } static int xts_decrypt(struct skcipher_request *req) @@ -220,7 +220,7 @@ static int xts_decrypt(struct skcipher_request *req) return glue_xts_req_128bit(&twofish_dec_xts, req, XTS_TWEAK_CAST(twofish_enc_blk), - &ctx->tweak_ctx, &ctx->crypt_ctx); + &ctx->tweak_ctx, &ctx->crypt_ctx, true); } static struct skcipher_alg twofish_algs[] = { diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 536b574b6161..3f8e22615812 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -285,15 +285,16 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); - /* - * NB: Native and x32 syscalls are dispatched from the same - * table. The only functional difference is the x32 bit in - * regs->orig_ax, which changes the behavior of some syscalls. - */ - nr &= __SYSCALL_MASK; if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); regs->ax = sys_call_table[nr](regs); +#ifdef CONFIG_X86_X32_ABI + } else if (likely((nr & __X32_SYSCALL_BIT) && + (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { + nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, + X32_NR_syscalls); + regs->ax = x32_sys_call_table[nr](regs); +#endif } syscall_return_slowpath(regs); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4f86928246e7..f83ca5aa8b77 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -63,7 +63,7 @@ * enough to patch inline, increasing performance. 
*/ -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) @@ -1084,7 +1084,7 @@ restore_all: INTERRUPT_RETURN restore_all_kernel: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0, PER_CPU_VAR(__preempt_count) jnz .Lno_preempt @@ -1364,7 +1364,7 @@ ENTRY(xen_hypervisor_callback) ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION call xen_maybe_preempt_hcall #endif jmp ret_from_intr diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index be9ca198c581..b7c3ea4cb19d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -664,7 +664,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) /* Returning to kernel space */ retint_kernel: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* Interrupts are off */ /* Check if we need preemption */ btl $9, EFLAGS(%rsp) /* were interrupts off? */ @@ -1058,10 +1058,10 @@ ENTRY(native_load_gs_index) ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) - _ASM_EXTABLE(.Lgs_change, bad_gs) + _ASM_EXTABLE(.Lgs_change, .Lbad_gs) .section .fixup, "ax" /* running with kernelgs */ -bad_gs: +.Lbad_gs: SWAPGS /* switch back to user gs */ .macro ZAP_GS /* This can't be a string because the preprocessor needs to see it. */ @@ -1115,7 +1115,7 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */ call xen_evtchn_do_upcall LEAVE_IRQ_STACK -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION call xen_maybe_preempt_hcall #endif jmp error_exit diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index d5252bc1e380..b1bf31713374 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -10,10 +10,13 @@ /* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ extern asmlinkage long sys_ni_syscall(const struct pt_regs *); #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); +#define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual) #include <asm/syscalls_64.h> #undef __SYSCALL_64 +#undef __SYSCALL_X32 #define __SYSCALL_64(nr, sym, qual) [nr] = sym, +#define __SYSCALL_X32(nr, sym, qual) asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* @@ -23,3 +26,25 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { [0 ... __NR_syscall_max] = &sys_ni_syscall, #include <asm/syscalls_64.h> }; + +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#ifdef CONFIG_X86_X32_ABI + +#define __SYSCALL_64(nr, sym, qual) +#define __SYSCALL_X32(nr, sym, qual) [nr] = sym, + +asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... 
__NR_syscall_x32_max] = &sys_ni_syscall, +#include <asm/syscalls_64.h> +}; + +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#endif diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index c00019abd076..3fe02546aed3 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -186,11 +186,11 @@ 172 i386 prctl sys_prctl __ia32_sys_prctl 173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction -175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask +175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_compat_sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending 177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 __ia32_compat_sys_rt_sigtimedwait_time32 178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo -179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend +179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_compat_sys_rt_sigsuspend 180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread 181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite 182 i386 chown sys_chown16 __ia32_sys_chown16 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 94fcd1951aca..1af2be39e7d9 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -1,13 +1,13 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 in="$1" out="$2" syscall_macro() { - abi="$1" - nr="$2" - entry="$3" + local abi="$1" + local nr="$2" + local entry="$3" # Entry can be either just a function name or "function/qualifier" real_entry="${entry%%/*}" @@ -21,14 +21,14 @@ syscall_macro() { } emit() { - abi="$1" - nr="$2" - entry="$3" - compat="$4" - umlentry="" + local abi="$1" + local nr="$2" + local entry="$3" + local compat="$4" + local umlentry="" - if [ "$abi" = "64" -a -n "$compat" ]; then - echo "a compat entry for a 64-bit syscall makes no sense" >&2 + if [ "$abi" != "I386" -a -n "$compat" ]; then + echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 exit 1 fi @@ -62,14 +62,17 @@ grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then - # COMMON is the same as 64, except that we don't expect X32 - # programs to use it. Our expectation has nothing to do with - # any generated code, so treat them the same. emit 64 "$nr" "$entry" "$compat" + if [ "$abi" = "COMMON" ]; then + # COMMON means that this syscall exists in the same form for + # 64-bit and X32. + echo "#ifdef CONFIG_X86_X32_ABI" + emit X32 "$nr" "$entry" "$compat" + echo "#endif" + fi elif [ "$abi" = "X32" ]; then - # X32 is equivalent to 64 on an X32-compatible kernel. 
echo "#ifdef CONFIG_X86_X32_ABI" - emit 64 "$nr" "$entry" "$compat" + emit X32 "$nr" "$entry" "$compat" echo "#endif" elif [ "$abi" = "I386" ]; then emit "$abi" "$nr" "$entry" "$compat" diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index cb3464525b37..2713490611a3 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -34,7 +34,7 @@ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION THUNK ___preempt_schedule, preempt_schedule THUNK ___preempt_schedule_notrace, preempt_schedule_notrace EXPORT_SYMBOL(___preempt_schedule) diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index cc20465b2867..ea5c4167086c 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -46,7 +46,7 @@ THUNK lockdep_sys_exit_thunk,lockdep_sys_exit #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION THUNK ___preempt_schedule, preempt_schedule THUNK ___preempt_schedule_notrace, preempt_schedule_notrace EXPORT_SYMBOL(___preempt_schedule) @@ -55,7 +55,7 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ - || defined(CONFIG_PREEMPT) + || defined(CONFIG_PREEMPTION) .L_restore: popq %r11 popq %r10 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 8df549138193..0f2154106d01 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -89,6 +89,7 @@ $(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS # CFLAGS_REMOVE_vdso-note.o = -pg CFLAGS_REMOVE_vclock_gettime.o = -pg +CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg CFLAGS_REMOVE_vvar.o = -pg @@ -128,7 +129,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso_and_check) -CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 targets += vdso32/vdso32.lds diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 349a61d8bf34..f5937742b290 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -122,7 +122,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) return vmf_insert_pfn(vma, vmf->address, - vmalloc_to_pfn(tsc_pg)); + virt_to_phys(tsc_pg) >> PAGE_SHIFT); } return VM_FAULT_SIGBUS; diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e7d35f60d53f..64c3e70b0556 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -5,12 +5,14 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/delay.h> +#include <linux/jiffies.h> #include <asm/apicdef.h> #include <asm/nmi.h> #include "../perf_event.h" -static DEFINE_PER_CPU(unsigned int, perf_nmi_counter); +static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp); +static unsigned long perf_nmi_window; static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -641,11 +643,12 @@ static void amd_pmu_disable_event(struct perf_event *event) * handler when multiple PMCs are active or PMC overflow while handling some * other source of an NMI. * - * Attempt to mitigate this by using the number of active PMCs to determine - * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset - * any PMCs. The per-CPU perf_nmi_counter variable is set to a minimum of the - * number of active PMCs or 2. 
The value of 2 is used in case an NMI does not - * arrive at the LAPIC in time to be collapsed into an already pending NMI. + * Attempt to mitigate this by creating an NMI window during which un-handled + * NMIs will be claimed. This prevents the window from extending past the + * time at which latent NMIs could still plausibly be received. The + * per-CPU perf_nmi_tstamp will be set to the window end time whenever perf has + * handled a counter. When an un-handled NMI is received, it will be claimed + * only if it arrives within that window. */ static int amd_pmu_handle_irq(struct pt_regs *regs) { @@ -663,21 +666,19 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) handled = x86_pmu_handle_irq(regs); /* - * If a counter was handled, record the number of possible remaining - * NMIs that can occur. + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. */ if (handled) { - this_cpu_write(perf_nmi_counter, - min_t(unsigned int, 2, active)); + this_cpu_write(perf_nmi_tstamp, + jiffies + perf_nmi_window); return handled; } - if (!this_cpu_read(perf_nmi_counter)) + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) return NMI_DONE; - this_cpu_dec(perf_nmi_counter); - return NMI_HANDLED; } @@ -909,6 +910,9 @@ static int __init amd_core_pmu_init(void) if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) return 0; + /* Avoid calculating the value each time in the NMI handler */ + perf_nmi_window = msecs_to_jiffies(100); + switch (boot_cpu_data.x86) { case 0x15: pr_cont("Fam15h "); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 5b35b7ea5d72..26c36357c4c9 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -377,7 +377,8 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 config) { config &= ~perf_ibs->cnt_mask; - wrmsrl(hwc->config_base, config); + if (boot_cpu_data.x86 == 0x10) + wrmsrl(hwc->config_base, config); config &= ~perf_ibs->enable_mask; wrmsrl(hwc->config_base, config); } @@ -553,7 +554,8 @@ static struct perf_ibs perf_ibs_op = { }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, - .cnt_mask = IBS_OP_MAX_CNT, + .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | + IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, .max_period = IBS_OP_MAX_CNT << 4, @@ -614,7 +616,7 @@ fail: if (event->attr.sample_type & PERF_SAMPLE_RAW) offset_max = perf_ibs->offset_max; else if (check_rip) - offset_max = 2; + offset_max = 3; else offset_max = 1; do { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 4886fc66fd88..7b21455d7504 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1005,6 +1005,27 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, /* current number of events already accepted */ n = cpuc->n_events; + if (!cpuc->n_events) + cpuc->pebs_output = 0; + + if (!cpuc->is_fake && leader->attr.precise_ip) { + /* + * For PEBS->PT, if !aux_event, the group leader (PT) went + * away, the group was broken down and this singleton event + * can't schedule any more. 
+ */ + if (is_pebs_pt(leader) && !leader->aux_event) + return -EINVAL; + + /* + * pebs_output: 0: no PEBS so far, 1: PT, 2: DS + */ + if (cpuc->pebs_output && + cpuc->pebs_output != is_pebs_pt(leader) + 1) + return -EINVAL; + + cpuc->pebs_output = is_pebs_pt(leader) + 1; + } if (is_x86_event(leader)) { if (n >= max_count) @@ -2241,6 +2262,17 @@ static int x86_pmu_check_period(struct perf_event *event, u64 value) return 0; } +static int x86_pmu_aux_output_match(struct perf_event *event) +{ + if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT)) + return 0; + + if (x86_pmu.aux_output_match) + return x86_pmu.aux_output_match(event); + + return 0; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -2266,6 +2298,8 @@ static struct pmu pmu = { .sched_task = x86_pmu_sched_task, .task_ctx_size = sizeof(struct x86_perf_task_context), .check_period = x86_pmu_check_period, + + .aux_output_match = x86_pmu_aux_output_match, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e4c2cb65ea50..fcef678c3423 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -18,6 +18,7 @@ #include <asm/cpufeature.h> #include <asm/hardirq.h> #include <asm/intel-family.h> +#include <asm/intel_pt.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> @@ -3298,6 +3299,13 @@ static int intel_pmu_hw_config(struct perf_event *event) } } + if (event->attr.aux_output) { + if (!event->attr.precise_ip) + return -EINVAL; + + event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; @@ -3816,6 +3824,14 @@ static int intel_pmu_check_period(struct perf_event *event, u64 value) return intel_pmu_has_bts_period(event, value) ? 
-EINVAL : 0; } +static int intel_pmu_aux_output_match(struct perf_event *event) +{ + if (!x86_pmu.intel_cap.pebs_output_pt_available) + return 0; + + return is_intel_pt_event(event); +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -3940,6 +3956,8 @@ static __initconst const struct x86_pmu intel_pmu = { .sched_task = intel_pmu_sched_task, .check_period = intel_pmu_check_period, + + .aux_output_match = intel_pmu_aux_output_match, }; static __init void intel_clovertown_quirk(void) @@ -3969,31 +3987,31 @@ static __init void intel_clovertown_quirk(void) } static const struct x86_cpu_desc isolation_ucodes[] = { - INTEL_CPU_DESC(INTEL_FAM6_HASWELL_CORE, 3, 0x0000001f), - INTEL_CPU_DESC(INTEL_FAM6_HASWELL_ULT, 1, 0x0000001e), - INTEL_CPU_DESC(INTEL_FAM6_HASWELL_GT3E, 1, 0x00000015), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL, 3, 0x0000001f), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_L, 1, 0x0000001e), + INTEL_CPU_DESC(INTEL_FAM6_HASWELL_G, 1, 0x00000015), INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 2, 0x00000037), INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 4, 0x0000000a), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_CORE, 4, 0x00000023), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_GT3E, 1, 0x00000014), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 2, 0x00000010), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 3, 0x07000009), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 4, 0x0f000009), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 5, 0x0e000002), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL, 4, 0x00000023), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_G, 1, 0x00000014), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 2, 0x00000010), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), - INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_MOBILE, 3, 0x0000007c), - INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_DESKTOP, 3, 0x0000007c), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 9, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 9, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 10, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 11, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 12, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 10, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 11, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 12, 0x0000004e), - INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 13, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 9, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 10, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 11, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_L, 12, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 10, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 11, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 12, 0x0000004e), + INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 13, 0x0000004e), {} }; @@ -4151,7 +4169,7 @@ static const struct x86_cpu_desc counter_freezing_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e), INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e), INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008), - 
INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_X, 1, 0x00000028), + INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028), INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), {} @@ -4649,7 +4667,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_X: + case INTEL_FAM6_ATOM_SILVERMONT_D: case INTEL_FAM6_ATOM_SILVERMONT_MID: case INTEL_FAM6_ATOM_AIRMONT: case INTEL_FAM6_ATOM_AIRMONT_MID: @@ -4671,7 +4689,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_X: + case INTEL_FAM6_ATOM_GOLDMONT_D: x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -4727,7 +4745,7 @@ __init int intel_pmu_init(void) name = "goldmont_plus"; break; - case INTEL_FAM6_ATOM_TREMONT_X: + case INTEL_FAM6_ATOM_TREMONT_D: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -4863,10 +4881,10 @@ __init int intel_pmu_init(void) break; - case INTEL_FAM6_HASWELL_CORE: + case INTEL_FAM6_HASWELL: case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_HASWELL_ULT: - case INTEL_FAM6_HASWELL_GT3E: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: x86_add_quirk(intel_ht_bug); x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; @@ -4896,9 +4914,9 @@ __init int intel_pmu_init(void) name = "haswell"; break; - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_XEON_D: - case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_D: + case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; @@ -4961,10 +4979,12 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SKYLAKE_X: pmem = true; /* fall through */ - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_COMETLAKE_L: + case INTEL_FAM6_COMETLAKE: x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -5008,11 +5028,13 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ICELAKE_X: - case INTEL_FAM6_ICELAKE_XEON_D: + case INTEL_FAM6_ICELAKE_D: pmem = true; /* fall through */ - case INTEL_FAM6_ICELAKE_MOBILE: - case INTEL_FAM6_ICELAKE_DESKTOP: + case INTEL_FAM6_ICELAKE_L: + case INTEL_FAM6_ICELAKE: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 688592b34564..e1daf4151e11 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -45,46 +45,49 @@ * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM, - CNL + * CNL,KBL,CML * Scope: Core * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf 
code: 0x03 - * Available model: SNB,IVB,HSW,BDW,SKL,CNL + * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, + * ICL,TGL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL + * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, + * KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL + * GLM,CNL,KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL,GLM,CNL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, + * KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,KBL,CNL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,KBL,CNL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT,KBL,GLM,CNL + * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL * Scope: Package (physical package) * */ @@ -446,7 +449,7 @@ static int cstate_cpu_init(unsigned int cpu) return 0; } -const struct attribute_group *core_attr_update[] = { +static const struct attribute_group *core_attr_update[] = { &group_cstate_core_c1, &group_cstate_core_c3, &group_cstate_core_c6, @@ -454,7 +457,7 @@ const struct attribute_group *core_attr_update[] = { NULL, }; -const struct attribute_group *pkg_attr_update[] = { +static const struct attribute_group *pkg_attr_update[] = { &group_cstate_pkg_c2, &group_cstate_pkg_c3, &group_cstate_pkg_c6, @@ -544,6 +547,19 @@ static const struct cstate_model cnl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model icl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -593,40 +609,44 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_CORE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_GT3E, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_G, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_L, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, 
slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_D, slm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_XEON_D, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_GT3E, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_D, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_G, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_MOBILE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_L, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_L, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE_L, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_MOBILE, cnl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_L, cnl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_D, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE_L, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE, icl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index f1269e804e9b..ce83950036c5 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -902,6 +902,9 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) */ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) { + if (cpuc->n_pebs == cpuc->n_pebs_via_pt) + return false; + return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); } @@ -919,6 +922,9 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) u64 threshold; int reserved; + if (cpuc->n_pebs_via_pt) + return; + if (x86_pmu.flags & PMU_FL_PEBS_ALL) reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed; else @@ -1059,10 +1065,40 @@ void intel_pmu_pebs_add(struct perf_event *event) cpuc->n_pebs++; if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs++; + if (hwc->flags & PERF_X86_EVENT_PEBS_VIA_PT) + cpuc->n_pebs_via_pt++; pebs_update_state(needed_cb, cpuc, event, true); } +static void intel_pmu_pebs_via_pt_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!is_pebs_pt(event)) + return; + + if (!(cpuc->pebs_enabled & 
~PEBS_VIA_PT_MASK)) + cpuc->pebs_enabled &= ~PEBS_VIA_PT_MASK; +} + +static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + + if (!is_pebs_pt(event)) + return; + + if (!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) + cpuc->pebs_enabled |= PEBS_PMI_AFTER_EACH_RECORD; + + cpuc->pebs_enabled |= PEBS_OUTPUT_PT; + + wrmsrl(MSR_RELOAD_PMC0 + hwc->idx, ds->pebs_event_reset[hwc->idx]); +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1100,6 +1136,8 @@ void intel_pmu_pebs_enable(struct perf_event *event) } else { ds->pebs_event_reset[hwc->idx] = 0; } + + intel_pmu_pebs_via_pt_enable(event); } void intel_pmu_pebs_del(struct perf_event *event) @@ -1111,6 +1149,8 @@ void intel_pmu_pebs_del(struct perf_event *event) cpuc->n_pebs--; if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs--; + if (hwc->flags & PERF_X86_EVENT_PEBS_VIA_PT) + cpuc->n_pebs_via_pt--; pebs_update_state(needed_cb, cpuc, event, false); } @@ -1120,7 +1160,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - if (cpuc->n_pebs == cpuc->n_large_pebs) + if (cpuc->n_pebs == cpuc->n_large_pebs && + cpuc->n_pebs != cpuc->n_pebs_via_pt) intel_pmu_drain_pebs_buffer(); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -1131,6 +1172,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); + intel_pmu_pebs_via_pt_disable(event); + if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -2031,6 +2074,12 @@ void __init intel_ds_init(void) PERF_SAMPLE_REGS_INTR); } pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + + if (x86_pmu.intel_cap.pebs_output_pt_available) { + pr_cont("PEBS-via-PT, "); + x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + } + break; default: diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 6f814a27416b..ea54634eabf3 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -273,7 +273,7 @@ static inline bool lbr_from_signext_quirk_needed(void) return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX); } -DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); +static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); /* If quirk is enabled, ensure sign extension is 63 bits: */ inline u64 lbr_from_signext_quirk_wr(u64 val) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index d3dc2274ddd4..05e43d0f430b 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -204,9 +204,9 @@ static int __init pt_pmu_hw_init(void) /* model-specific quirks */ switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_XEON_D: - case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_D: + case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: /* not setting BRANCH_EN will #GP, erratum BDM106 */ pt_pmu.branch_en_always_on = true; @@ -545,33 +545,62 @@ static void pt_config_buffer(void *buf, unsigned int topa_idx, wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); } -/* - * Keep ToPA table-related metadata on the same page as the actual table, - * taking up a few words from the top - */ - -#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) - 
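The pt.c rework that follows splits the old page-sized struct topa into a small metadata struct plus a page-sized struct topa_page overlay, so the hardware-visible ToPA entries and their bookkeeping still share one page while the rest of the code passes around only the lightweight metadata pointer. A minimal sketch of that overlay pattern, with invented names (entry_page/meta are illustrative, not the kernel's):

/*
 * Illustrative only: the entry array fills the page up to a trailing
 * metadata struct; container_of() recovers the page from the metadata,
 * and PAGE_MASK recovers it from any entry, because each table is one
 * naturally aligned page (alloc_pages_node(..., 0) in the patch).
 */
#include <linux/types.h>	/* u64 */
#include <linux/kernel.h>	/* container_of() */
#include <linux/list.h>
#include <asm/page.h>		/* PAGE_SIZE, PAGE_MASK */

struct meta {
	struct list_head	list;
	int			last;
};

struct entry_page {
	u64		table[(PAGE_SIZE - sizeof(struct meta)) / sizeof(u64)];
	struct meta	meta;		/* tail of the same page */
};

static inline struct entry_page *meta_to_page(struct meta *m)
{
	return container_of(m, struct entry_page, meta);
}

static inline struct entry_page *entry_to_page(u64 *te)
{
	return (struct entry_page *)((unsigned long)te & PAGE_MASK);
}

This mirrors topa_to_page()/topa_entry_to_page() in the hunk below; the same alignment argument is what lets topa_pfn() go from metadata to physical frame without the old @phys field.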
/** - * struct topa - page-sized ToPA table with metadata at the top - * @table: actual ToPA table entries, as understood by PT hardware + * struct topa - ToPA metadata * @list: linkage to struct pt_buffer's list of tables - * @phys: physical address of this page * @offset: offset of the first entry in this table in the buffer * @size: total size of all entries in this table * @last: index of the last initialized entry in this table + * @z_count: how many times the first entry repeats */ struct topa { - struct topa_entry table[TENTS_PER_PAGE]; struct list_head list; - u64 phys; u64 offset; size_t size; int last; + unsigned int z_count; }; +/* + * Keep ToPA table-related metadata on the same page as the actual table, + * taking up a few words from the top + */ + +#define TENTS_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct topa)) / sizeof(struct topa_entry)) + +/** + * struct topa_page - page-sized ToPA table with metadata at the top + * @table: actual ToPA table entries, as understood by PT hardware + * @topa: metadata + */ +struct topa_page { + struct topa_entry table[TENTS_PER_PAGE]; + struct topa topa; +}; + +static inline struct topa_page *topa_to_page(struct topa *topa) +{ + return container_of(topa, struct topa_page, topa); +} + +static inline struct topa_page *topa_entry_to_page(struct topa_entry *te) +{ + return (struct topa_page *)((unsigned long)te & PAGE_MASK); +} + +static inline phys_addr_t topa_pfn(struct topa *topa) +{ + return PFN_DOWN(virt_to_phys(topa_to_page(topa))); +} + /* make -1 stand for the last table entry */ -#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) +#define TOPA_ENTRY(t, i) \ + ((i) == -1 \ + ? &topa_to_page(t)->table[(t)->last] \ + : &topa_to_page(t)->table[(i)]) +#define TOPA_ENTRY_SIZE(t, i) (sizes(TOPA_ENTRY((t), (i))->size)) +#define TOPA_ENTRY_PAGES(t, i) (1 << TOPA_ENTRY((t), (i))->size) /** * topa_alloc() - allocate page-sized ToPA table @@ -583,27 +612,26 @@ struct topa { static struct topa *topa_alloc(int cpu, gfp_t gfp) { int node = cpu_to_node(cpu); - struct topa *topa; + struct topa_page *tp; struct page *p; p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); if (!p) return NULL; - topa = page_address(p); - topa->last = 0; - topa->phys = page_to_phys(p); + tp = page_address(p); + tp->topa.last = 0; /* * In case of singe-entry ToPA, always put the self-referencing END * link as the 2nd entry in the table */ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; - TOPA_ENTRY(topa, 1)->end = 1; + TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT; + TOPA_ENTRY(&tp->topa, 1)->end = 1; } - return topa; + return &tp->topa; } /** @@ -643,7 +671,7 @@ static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) BUG_ON(last->last != TENTS_PER_PAGE - 1); - TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; + TOPA_ENTRY(last, -1)->base = topa_pfn(topa); TOPA_ENTRY(last, -1)->end = 1; } @@ -670,7 +698,7 @@ static bool topa_table_full(struct topa *topa) * * Return: 0 on success or error code. 
*/ -static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) +static int topa_insert_pages(struct pt_buffer *buf, int cpu, gfp_t gfp) { struct topa *topa = buf->last; int order = 0; @@ -681,13 +709,18 @@ static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) order = page_private(p); if (topa_table_full(topa)) { - topa = topa_alloc(buf->cpu, gfp); + topa = topa_alloc(cpu, gfp); if (!topa) return -ENOMEM; topa_insert_table(buf, topa); } + if (topa->z_count == topa->last - 1) { + if (order == TOPA_ENTRY(topa, topa->last - 1)->size) + topa->z_count++; + } + TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; TOPA_ENTRY(topa, -1)->size = order; if (!buf->snapshot && @@ -713,23 +746,26 @@ static void pt_topa_dump(struct pt_buffer *buf) struct topa *topa; list_for_each_entry(topa, &buf->tables, list) { + struct topa_page *tp = topa_to_page(topa); int i; - pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, - topa->phys, topa->offset, topa->size); + pr_debug("# table @%p, off %llx size %zx\n", tp->table, + topa->offset, topa->size); for (i = 0; i < TENTS_PER_PAGE; i++) { pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", - &topa->table[i], - (unsigned long)topa->table[i].base << TOPA_SHIFT, - sizes(topa->table[i].size), - topa->table[i].end ? 'E' : ' ', - topa->table[i].intr ? 'I' : ' ', - topa->table[i].stop ? 'S' : ' ', - *(u64 *)&topa->table[i]); + &tp->table[i], + (unsigned long)tp->table[i].base << TOPA_SHIFT, + sizes(tp->table[i].size), + tp->table[i].end ? 'E' : ' ', + tp->table[i].intr ? 'I' : ' ', + tp->table[i].stop ? 'S' : ' ', + *(u64 *)&tp->table[i]); if ((intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) && - topa->table[i].stop) || - topa->table[i].end) + tp->table[i].stop) || + tp->table[i].end) break; + if (!i && topa->z_count) + i += topa->z_count; } } } @@ -771,7 +807,7 @@ static void pt_update_head(struct pt *pt) /* offset of the current output region within this table */ for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) - base += sizes(buf->cur->table[topa_idx].size); + base += TOPA_ENTRY_SIZE(buf->cur, topa_idx); if (buf->snapshot) { local_set(&buf->data_size, base); @@ -791,7 +827,7 @@ static void pt_update_head(struct pt *pt) */ static void *pt_buffer_region(struct pt_buffer *buf) { - return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); + return phys_to_virt(TOPA_ENTRY(buf->cur, buf->cur_idx)->base << TOPA_SHIFT); } /** @@ -800,7 +836,7 @@ static void *pt_buffer_region(struct pt_buffer *buf) */ static size_t pt_buffer_region_size(struct pt_buffer *buf) { - return sizes(buf->cur->table[buf->cur_idx].size); + return TOPA_ENTRY_SIZE(buf->cur, buf->cur_idx); } /** @@ -830,7 +866,7 @@ static void pt_handle_status(struct pt *pt) * know. 
*/ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || - buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + buf->output_off == pt_buffer_region_size(buf)) { perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_TRUNCATED); advance++; @@ -868,9 +904,11 @@ static void pt_handle_status(struct pt *pt) static void pt_read_offset(struct pt_buffer *buf) { u64 offset, base_topa; + struct topa_page *tp; rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); - buf->cur = phys_to_virt(base_topa); + tp = phys_to_virt(base_topa); + buf->cur = &tp->topa; rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); /* offset within current output region */ @@ -879,29 +917,97 @@ static void pt_read_offset(struct pt_buffer *buf) buf->cur_idx = (offset & 0xffffff80) >> 7; } -/** - * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry - * @buf: PT buffer. - * @pg: Page offset in the buffer. - * - * When advancing to the next output region (ToPA entry), given a page offset - * into the buffer, we need to find the offset of the first page in the next - * region. - */ -static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) +static struct topa_entry * +pt_topa_entry_for_page(struct pt_buffer *buf, unsigned int pg) { - struct topa_entry *te = buf->topa_index[pg]; + struct topa_page *tp; + struct topa *topa; + unsigned int idx, cur_pg = 0, z_pg = 0, start_idx = 0; - /* one region */ - if (buf->first == buf->last && buf->first->last == 1) - return pg; + /* + * Indicates a bug in the caller. + */ + if (WARN_ON_ONCE(pg >= buf->nr_pages)) + return NULL; + + /* + * First, find the ToPA table where @pg fits. With high + * order allocations, there shouldn't be many of these. + */ + list_for_each_entry(topa, &buf->tables, list) { + if (topa->offset + topa->size > pg << PAGE_SHIFT) + goto found; + } + + /* + * Hitting this means we have a problem in the ToPA + * allocation code. + */ + WARN_ON_ONCE(1); - do { - pg++; - pg &= buf->nr_pages - 1; - } while (buf->topa_index[pg] == te); + return NULL; - return pg; +found: + /* + * Indicates a problem in the ToPA allocation code. + */ + if (WARN_ON_ONCE(topa->last == -1)) + return NULL; + + tp = topa_to_page(topa); + cur_pg = PFN_DOWN(topa->offset); + if (topa->z_count) { + z_pg = TOPA_ENTRY_PAGES(topa, 0) * (topa->z_count + 1); + start_idx = topa->z_count + 1; + } + + /* + * Multiple entries at the beginning of the table have the same size, + * ideally all of them; if @pg falls there, the search is done. + */ + if (pg >= cur_pg && pg < cur_pg + z_pg) { + idx = (pg - cur_pg) / TOPA_ENTRY_PAGES(topa, 0); + return &tp->table[idx]; + } + + /* + * Otherwise, slow path: iterate through the remaining entries. + */ + for (idx = start_idx, cur_pg += z_pg; idx < topa->last; idx++) { + if (cur_pg + TOPA_ENTRY_PAGES(topa, idx) > pg) + return &tp->table[idx]; + + cur_pg += TOPA_ENTRY_PAGES(topa, idx); + } + + /* + * Means we couldn't find a ToPA entry in the table that does match. 
+ */ + WARN_ON_ONCE(1); + + return NULL; +} + +static struct topa_entry * +pt_topa_prev_entry(struct pt_buffer *buf, struct topa_entry *te) +{ + unsigned long table = (unsigned long)te & ~(PAGE_SIZE - 1); + struct topa_page *tp; + struct topa *topa; + + tp = (struct topa_page *)table; + if (tp->table != te) + return --te; + + topa = &tp->topa; + if (topa == buf->first) + topa = buf->last; + else + topa = list_prev_entry(topa, list); + + tp = topa_to_page(topa); + + return &tp->table[topa->last - 1]; } /** @@ -925,8 +1031,7 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, unsigned long idx, npages, wakeup; /* can't stop in the middle of an output region */ - if (buf->output_off + handle->size + 1 < - sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + if (buf->output_off + handle->size + 1 < pt_buffer_region_size(buf)) { perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); return -EINVAL; } @@ -937,9 +1042,13 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, return 0; /* clear STOP and INT from current entry */ - buf->topa_index[buf->stop_pos]->stop = 0; - buf->topa_index[buf->stop_pos]->intr = 0; - buf->topa_index[buf->intr_pos]->intr = 0; + if (buf->stop_te) { + buf->stop_te->stop = 0; + buf->stop_te->intr = 0; + } + + if (buf->intr_te) + buf->intr_te->intr = 0; /* how many pages till the STOP marker */ npages = handle->size >> PAGE_SHIFT; @@ -950,7 +1059,12 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, idx = (head >> PAGE_SHIFT) + npages; idx &= buf->nr_pages - 1; - buf->stop_pos = idx; + + if (idx != buf->stop_pos) { + buf->stop_pos = idx; + buf->stop_te = pt_topa_entry_for_page(buf, idx); + buf->stop_te = pt_topa_prev_entry(buf, buf->stop_te); + } wakeup = handle->wakeup >> PAGE_SHIFT; @@ -960,51 +1074,20 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, idx = wakeup; idx &= buf->nr_pages - 1; - buf->intr_pos = idx; + if (idx != buf->intr_pos) { + buf->intr_pos = idx; + buf->intr_te = pt_topa_entry_for_page(buf, idx); + buf->intr_te = pt_topa_prev_entry(buf, buf->intr_te); + } - buf->topa_index[buf->stop_pos]->stop = 1; - buf->topa_index[buf->stop_pos]->intr = 1; - buf->topa_index[buf->intr_pos]->intr = 1; + buf->stop_te->stop = 1; + buf->stop_te->intr = 1; + buf->intr_te->intr = 1; return 0; } /** - * pt_buffer_setup_topa_index() - build topa_index[] table of regions - * @buf: PT buffer. - * - * topa_index[] references output regions indexed by offset into the - * buffer for purposes of quick reverse lookup. - */ -static void pt_buffer_setup_topa_index(struct pt_buffer *buf) -{ - struct topa *cur = buf->first, *prev = buf->last; - struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), - *te_prev = TOPA_ENTRY(prev, prev->last - 1); - int pg = 0, idx = 0; - - while (pg < buf->nr_pages) { - int tidx; - - /* pages within one topa entry */ - for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) - buf->topa_index[pg] = te_prev; - - te_prev = te_cur; - - if (idx == cur->last - 1) { - /* advance to next topa table */ - idx = 0; - cur = list_entry(cur->list.next, struct topa, list); - } else { - idx++; - } - te_cur = TOPA_ENTRY(cur, idx); - } - -} - -/** * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head * @buf: PT buffer. * @head: Write pointer (aux_head) from AUX buffer. 
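With the per-page topa_index[] array gone from struct pt_buffer, the reverse page-to-entry mapping above is recomputed on demand: pt_topa_entry_for_page() first finds the right table by cumulative offset, then uses z_count to jump over the leading run of equally sized entries in O(1) before walking the remainder. A rough standalone model of that two-phase search (names and the flat pages[] array are invented for illustration):

/*
 * Entry i covers pages[i] pages; z_count counts how many entries
 * *after* entry 0 repeat its size, so entries 0..z_count form a
 * uniform run. Returns the entry index covering page @pg, or -1
 * if @pg lies beyond the table.
 */
static int entry_for_page(const unsigned int *pages, int nents,
			  unsigned int z_count, unsigned int pg)
{
	unsigned int cur, z_pages = pages[0] * (z_count + 1);
	int idx;

	if (pg < z_pages)			/* fast path: uniform run */
		return pg / pages[0];

	for (idx = z_count + 1, cur = z_pages; idx < nents; idx++) {
		if (cur + pages[idx] > pg)	/* @pg lands in entry idx */
			return idx;
		cur += pages[idx];
	}

	return -1;
}

The cost of the walk is paid only when markers move: pt_buffer_reset_markers() above caches the results in the new stop_te/intr_te pointers and recomputes them only when stop_pos/intr_pos actually change.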
@@ -1021,18 +1104,20 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) */ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) { + struct topa_page *cur_tp; + struct topa_entry *te; int pg; if (buf->snapshot) head &= (buf->nr_pages << PAGE_SHIFT) - 1; pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); - pg = pt_topa_next_entry(buf, pg); + te = pt_topa_entry_for_page(buf, pg); - buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); - buf->cur_idx = ((unsigned long)buf->topa_index[pg] - - (unsigned long)buf->cur) / sizeof(struct topa_entry); - buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); + cur_tp = topa_entry_to_page(te); + buf->cur = &cur_tp->topa; + buf->cur_idx = te - TOPA_ENTRY(buf->cur, 0); + buf->output_off = head & (pt_buffer_region_size(buf) - 1); local64_set(&buf->head, head); local_set(&buf->data_size, 0); @@ -1061,31 +1146,29 @@ static void pt_buffer_fini_topa(struct pt_buffer *buf) * @size: Total size of all regions within this ToPA. * @gfp: Allocation flags. */ -static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, - gfp_t gfp) +static int pt_buffer_init_topa(struct pt_buffer *buf, int cpu, + unsigned long nr_pages, gfp_t gfp) { struct topa *topa; int err; - topa = topa_alloc(buf->cpu, gfp); + topa = topa_alloc(cpu, gfp); if (!topa) return -ENOMEM; topa_insert_table(buf, topa); while (buf->nr_pages < nr_pages) { - err = topa_insert_pages(buf, gfp); + err = topa_insert_pages(buf, cpu, gfp); if (err) { pt_buffer_fini_topa(buf); return -ENOMEM; } } - pt_buffer_setup_topa_index(buf); - /* link last table to the first one, unless we're double buffering */ if (intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; + TOPA_ENTRY(buf->last, -1)->base = topa_pfn(buf->first); TOPA_ENTRY(buf->last, -1)->end = 1; } @@ -1119,18 +1202,18 @@ pt_buffer_setup_aux(struct perf_event *event, void **pages, cpu = raw_smp_processor_id(); node = cpu_to_node(cpu); - buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), - GFP_KERNEL, node); + buf = kzalloc_node(sizeof(struct pt_buffer), GFP_KERNEL, node); if (!buf) return NULL; - buf->cpu = cpu; buf->snapshot = snapshot; buf->data_pages = pages; + buf->stop_pos = -1; + buf->intr_pos = -1; INIT_LIST_HEAD(&buf->tables); - ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); + ret = pt_buffer_init_topa(buf, cpu, nr_pages, GFP_KERNEL); if (ret) { kfree(buf); return NULL; @@ -1296,7 +1379,7 @@ void intel_pt_interrupt(void) return; } - pt_config_buffer(buf->cur->table, buf->cur_idx, + pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx, buf->output_off); pt_config(event); } @@ -1361,7 +1444,7 @@ static void pt_event_start(struct perf_event *event, int mode) WRITE_ONCE(pt->handle_nmi, 1); hwc->state = 0; - pt_config_buffer(buf->cur->table, buf->cur_idx, + pt_config_buffer(topa_to_page(buf->cur)->table, buf->cur_idx, buf->output_off); pt_config(event); @@ -1481,6 +1564,11 @@ void cpu_emergency_stop_pt(void) pt_event_stop(pt->handle.event, PERF_EF_UPDATE); } +int is_intel_pt_event(struct perf_event *event) +{ + return event->pmu == &pt_pmu.pmu; +} + static __init int pt_init(void) { int ret, cpu, prior_warn = 0; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 63fe4063fbd6..1d2bb7572374 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -53,7 +53,6 @@ struct pt_pmu { /** * struct pt_buffer - buffer 
configuration; one buffer per task_struct or * cpu, depending on perf event configuration - * @cpu: cpu for per-cpu allocation * @tables: list of ToPA tables in this buffer * @first: shorthand for first topa table * @last: shorthand for last topa table @@ -65,13 +64,14 @@ struct pt_pmu { * @lost: if data was lost/truncated * @head: logical write offset inside the buffer * @snapshot: if this is for a snapshot/overwrite counter - * @stop_pos: STOP topa entry in the buffer - * @intr_pos: INT topa entry in the buffer + * @stop_pos: STOP topa entry index + * @intr_pos: INT topa entry index + * @stop_te: STOP topa entry pointer + * @intr_te: INT topa entry pointer * @data_pages: array of pages from perf * @topa_index: table of topa entries indexed by page offset */ struct pt_buffer { - int cpu; struct list_head tables; struct topa *first, *last, *cur; unsigned int cur_idx; @@ -80,9 +80,9 @@ struct pt_buffer { local_t data_size; local64_t head; bool snapshot; - unsigned long stop_pos, intr_pos; + long stop_pos, intr_pos; + struct topa_entry *stop_te, *intr_te; void **data_pages; - struct topa_entry *topa_index[0]; }; #define PT_FILTERS_NUM 4 diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 64ab51ffdf06..5053a403e4ae 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -634,7 +634,7 @@ static void cleanup_rapl_pmus(void) kfree(rapl_pmus); } -const struct attribute_group *rapl_attr_update[] = { +static const struct attribute_group *rapl_attr_update[] = { &rapl_events_cores_group, &rapl_events_pkg_group, &rapl_events_ram_group, @@ -720,27 +720,27 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, model_snbep), X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, model_snb), X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, model_snbep), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_L, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_G, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, model_hsx), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, model_knl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, model_knl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE, model_skl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_L, model_skl), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, model_hsw), - 
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_D, model_hsw), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 3694a5d0703d..86467f85c383 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -502,10 +502,8 @@ void uncore_pmu_event_start(struct perf_event *event, int flags) local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); uncore_enable_event(box, event); - if (box->n_active == 1) { - uncore_enable_box(box); + if (box->n_active == 1) uncore_pmu_start_hrtimer(box); - } } void uncore_pmu_event_stop(struct perf_event *event, int flags) @@ -529,10 +527,8 @@ void uncore_pmu_event_stop(struct perf_event *event, int flags) WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; - if (box->n_active == 0) { - uncore_disable_box(box); + if (box->n_active == 0) uncore_pmu_cancel_hrtimer(box); - } } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { @@ -778,6 +774,40 @@ static int uncore_pmu_event_init(struct perf_event *event) return ret; } +static void uncore_pmu_enable(struct pmu *pmu) +{ + struct intel_uncore_pmu *uncore_pmu; + struct intel_uncore_box *box; + + uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); + if (!uncore_pmu) + return; + + box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); + if (!box) + return; + + if (uncore_pmu->type->ops->enable_box) + uncore_pmu->type->ops->enable_box(box); +} + +static void uncore_pmu_disable(struct pmu *pmu) +{ + struct intel_uncore_pmu *uncore_pmu; + struct intel_uncore_box *box; + + uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); + if (!uncore_pmu) + return; + + box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); + if (!box) + return; + + if (uncore_pmu->type->ops->disable_box) + uncore_pmu->type->ops->disable_box(box); +} + static ssize_t uncore_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { @@ -803,6 +833,8 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) pmu->pmu = (struct pmu) { .attr_groups = pmu->type->attr_groups, .task_ctx_nr = perf_invalid_context, + .pmu_enable = uncore_pmu_enable, + .pmu_disable = uncore_pmu_disable, .event_init = uncore_pmu_event_init, .add = uncore_pmu_event_add, .del = uncore_pmu_event_del, @@ -1451,29 +1483,29 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, ivb_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, bdw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, bdw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_L, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_G, hsw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL, 
bdw_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, bdw_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EX, nhmex_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EX, nhmex_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, ivbep_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hswep_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, bdx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, bdx_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, bdx_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_X, snr_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_D, snr_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index f36f7bebbc1b..bbfdaa720b45 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -441,18 +441,6 @@ static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box, return -EINVAL; } -static inline void uncore_disable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->disable_box) - box->pmu->type->ops->disable_box(box); -} - -static inline void uncore_enable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->enable_box) - box->pmu->type->ops->enable_box(box); -} - static inline void uncore_disable_event(struct intel_uncore_box *box, struct perf_event *event) { diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 9431447541e9..6f86650b3f77 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -59,22 +59,22 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_IVYBRIDGE: case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_CORE: + case INTEL_FAM6_HASWELL: case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_HASWELL_ULT: - case INTEL_FAM6_HASWELL_GT3E: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_XEON_D: - case INTEL_FAM6_BROADWELL_GT3E: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_D: + case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_ATOM_SILVERMONT: - case INTEL_FAM6_ATOM_SILVERMONT_X: + case INTEL_FAM6_ATOM_SILVERMONT_D: case INTEL_FAM6_ATOM_AIRMONT: case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_X: + case 
INTEL_FAM6_ATOM_GOLDMONT_D: case INTEL_FAM6_ATOM_GOLDMONT_PLUS: @@ -84,12 +84,19 @@ static bool test_intel(int idx, void *data) return true; break; - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - case INTEL_FAM6_ICELAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_COMETLAKE_L: + case INTEL_FAM6_COMETLAKE: + case INTEL_FAM6_ICELAKE_L: + case INTEL_FAM6_ICELAKE: + case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_D: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; @@ -167,7 +174,7 @@ static const struct attribute_group *attr_groups[] = { NULL, }; -const struct attribute_group *attr_update[] = { +static const struct attribute_group *attr_update[] = { &group_aperf, &group_mperf, &group_pperf, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8751008fc170..ecacfbf4ebc1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -76,6 +76,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -85,6 +86,11 @@ struct amd_nb { }; #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) +#define PEBS_PMI_AFTER_EACH_RECORD BIT_ULL(60) +#define PEBS_OUTPUT_OFFSET 61 +#define PEBS_OUTPUT_MASK (3ull << PEBS_OUTPUT_OFFSET) +#define PEBS_OUTPUT_PT (1ull << PEBS_OUTPUT_OFFSET) +#define PEBS_VIA_PT_MASK (PEBS_OUTPUT_PT | PEBS_PMI_AFTER_EACH_RECORD) /* * Flags PEBS can handle without an PMI. @@ -211,6 +217,8 @@ struct cpu_hw_events { u64 pebs_enabled; int n_pebs; int n_large_pebs; + int n_pebs_via_pt; + int pebs_output; /* Current super set of events hardware configuration */ u64 pebs_data_cfg; @@ -510,6 +518,8 @@ union perf_capabilities { */ u64 full_width_write:1; u64 pebs_baseline:1; + u64 pebs_metrics_available:1; + u64 pebs_output_pt_available:1; }; u64 capabilities; }; @@ -692,6 +702,8 @@ struct x86_pmu { * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 period); + + int (*aux_output_match) (struct perf_event *event); }; struct x86_perf_task_context { @@ -901,6 +913,11 @@ static inline int amd_pmu_init(void) #endif /* CONFIG_CPU_SUP_AMD */ +static inline int is_pebs_pt(struct perf_event *event) +{ + return !!(event->hw.flags & PERF_X86_EVENT_PEBS_VIA_PT); +} + #ifdef CONFIG_CPU_SUP_INTEL static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5c056b8aebef..e01078e93dd3 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -260,11 +260,21 @@ void __init hv_apic_init(void) } if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) { - pr_info("Hyper-V: Using MSR based APIC access\n"); + pr_info("Hyper-V: Using enlightened APIC (%s mode)", + x2apic_enabled() ? "x2apic" : "xapic"); + /* + * With x2apic, architectural x2apic MSRs are equivalent to the + * respective synthetic MSRs, so there's no need to override + * the apic accessors. 
The only exception is + * hv_apic_eoi_write, because it benefits from lazy EOI when + * available, but it works for both xapic and x2apic modes. + */ apic_set_eoi_write(hv_apic_eoi_write); - apic->read = hv_apic_read; - apic->write = hv_apic_write; - apic->icr_write = hv_apic_icr_write; - apic->icr_read = hv_apic_icr_read; + if (!x2apic_enabled()) { + apic->read = hv_apic_read; + apic->write = hv_apic_write; + apic->icr_write = hv_apic_icr_write; + apic->icr_read = hv_apic_icr_read; + } } } diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 0d258688c8cf..2db3972c0e0f 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -37,6 +37,20 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); u32 hv_max_vp_index; EXPORT_SYMBOL_GPL(hv_max_vp_index); +void *hv_alloc_hyperv_page(void) +{ + BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE); + + return (void *)__get_free_page(GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page); + +void hv_free_hyperv_page(unsigned long addr) +{ + free_page(addr); +} +EXPORT_SYMBOL_GPL(hv_free_hyperv_page); + static int hv_cpu_init(unsigned int cpu) { u64 msr_vp_index; @@ -301,8 +315,6 @@ void __init hyperv_init(void) x86_init.pci.arch_init = hv_pci_init; - /* Register Hyper-V specific clocksource */ - hv_init_clocksource(); return; remove_cpuhp_state: diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index e65d7fe6489f..5208ba49c89a 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -37,12 +37,14 @@ static inline int fill_gva_list(u64 gva_list[], int offset, * Lower 12 bits encode the number of additional * pages to flush (in addition to the 'cur' page). */ - if (diff >= HV_TLB_FLUSH_UNIT) + if (diff >= HV_TLB_FLUSH_UNIT) { gva_list[gva_n] |= ~PAGE_MASK; - else if (diff) + cur += HV_TLB_FLUSH_UNIT; + } else if (diff) { gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + cur = end; + } - cur += HV_TLB_FLUSH_UNIT; gva_n++; } while (cur < end); diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index aac686e1e005..bc9693c9107e 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -117,6 +117,12 @@ static inline bool acpi_has_cpu_in_madt(void) return !!acpi_lapic; } +#define ACPI_HAVE_ARCH_SET_ROOT_POINTER +static inline void acpi_arch_set_root_pointer(u64 addr) +{ + x86_init.acpi.set_root_pointer(addr); +} + #define ACPI_HAVE_ARCH_GET_ROOT_POINTER static inline u64 acpi_arch_get_root_pointer(void) { @@ -125,6 +131,7 @@ static inline u64 acpi_arch_get_root_pointer(void) void acpi_generic_reduced_hw_init(void); +void x86_default_set_root_pointer(u64 addr); u64 x86_default_get_root_pointer(void); #else /* !CONFIG_ACPI */ @@ -138,6 +145,8 @@ static inline void disable_acpi(void) { } static inline void acpi_generic_reduced_hw_init(void) { } +static inline void x86_default_set_root_pointer(u64 addr) { } + static inline u64 x86_default_get_root_pointer(void) { return 0; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 094fbc9c0b1c..13adca37c99a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -201,10 +201,10 @@ static inline int alternatives_text_reserved(void *start, void *end) * without volatile and memory clobber. 
*/ #define alternative(oldinstr, newinstr, feature) \ - asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") + asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") #define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ - asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") /* * Alternative inline assembly with input. @@ -218,7 +218,7 @@ static inline int alternatives_text_reserved(void *start, void *end) * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ - asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ : : "i" (0), ## input) /* @@ -231,18 +231,18 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \ feature2, input...) \ - asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ + asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \ newinstr2, feature2) \ : : "i" (0), ## input) /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, feature, output, input...) \ - asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ : output : "i" (0), ## input) /* Like alternative_io, but for replacing a direct call with another one. */ #define alternative_call(oldfunc, newfunc, feature, output, input...) \ - asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ + asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) /* @@ -253,7 +253,7 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) 
\ - asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ + asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ : output, ASM_CALL_CONSTRAINT \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index e647aa095867..2ebc17d9c72c 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -136,6 +136,7 @@ extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); +extern void apic_soft_disable(void); extern void lapic_shutdown(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); @@ -176,6 +177,8 @@ extern void lapic_online(void); extern void lapic_offline(void); extern bool apic_needs_pit(void); +extern void apic_send_IPI_allbutself(unsigned int vector); + #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } #define local_apic_timer_c2_ok 1 @@ -465,12 +468,6 @@ static inline unsigned default_get_apic_id(unsigned long x) #define TRAMPOLINE_PHYS_LOW 0x467 #define TRAMPOLINE_PHYS_HIGH 0x469 -#ifdef CONFIG_X86_64 -extern void apic_send_IPI_self(int vector); - -DECLARE_PER_CPU(int, x2apic_extra_bits); -#endif - extern void generic_bigsmp_probe(void); #ifdef CONFIG_X86_LOCAL_APIC @@ -506,8 +503,10 @@ extern int default_check_phys_apicid_present(int phys_apicid); #ifdef CONFIG_SMP bool apic_id_is_primary_thread(unsigned int id); +void apic_smt_update(void); #else static inline bool apic_id_is_primary_thread(unsigned int id) { return false; } +static inline void apic_smt_update(void) { } #endif extern void irq_enter(void); diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h deleted file mode 100644 index d3a2b3876ce6..000000000000 --- a/arch/x86/include/asm/apic_flat_64.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_APIC_FLAT_64_H -#define _ASM_X86_APIC_FLAT_64_H - -extern void flat_init_apic_ldr(void); - -#endif - diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 84f848c2541a..7f828fe49797 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -49,8 +49,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define array_index_mask_nospec array_index_mask_nospec /* Prevent speculative execution past this barrier. */ -#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ - "lfence", X86_FEATURE_LFENCE_RDTSC) +#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC) #define dma_rmb() barrier() #define dma_wmb() barrier() diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index ba15d53c1ca7..7d1f6a49bfae 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -45,14 +45,13 @@ * We do the locked ops that don't return the old value as * a mask operation on a byte. 
*/ -#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) #define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { - if (IS_IMMEDIATE(nr)) { + if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "orb %1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)CONST_MASK(nr)) @@ -72,7 +71,7 @@ arch___set_bit(long nr, volatile unsigned long *addr) static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { - if (IS_IMMEDIATE(nr)) { + if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "andb %1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)~CONST_MASK(nr))); @@ -123,7 +122,7 @@ arch___change_bit(long nr, volatile unsigned long *addr) static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { - if (IS_IMMEDIATE(nr)) { + if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" ((u8)CONST_MASK(nr))); diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 9e5f3c722c33..981fe923a59f 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -70,6 +70,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) BOOT_PARAM_PRESERVE(eddbuf_entries), BOOT_PARAM_PRESERVE(edd_mbr_sig_buf_entries), BOOT_PARAM_PRESERVE(edd_mbr_sig_buffer), + BOOT_PARAM_PRESERVE(secure_boot), BOOT_PARAM_PRESERVE(hdr), BOOT_PARAM_PRESERVE(e820_table), BOOT_PARAM_PRESERVE(eddbuf), diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 6804d6642767..facba9bc30ca 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -32,7 +32,7 @@ #define _BUG_FLAGS(ins, flags) \ do { \ - asm volatile("1:\t" ins "\n" \ + asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ @@ -49,7 +49,7 @@ do { \ #define _BUG_FLAGS(ins, flags) \ do { \ - asm volatile("1:\t" ins "\n" \ + asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t.word %c0" "\t# bug_entry::flags\n" \ diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h index 542509b53e0f..794eb2129bc6 100644 --- a/arch/x86/include/asm/bugs.h +++ b/arch/x86/include/asm/bugs.h @@ -18,4 +18,6 @@ int ppro_with_ram_bug(void); static inline int ppro_with_ram_bug(void) { return 0; } #endif +extern void cpu_bugs_smt_update(void); + #endif /* _ASM_X86_BUGS_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index cff3f3f3bfe0..8348f7d69fd5 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPU_ENTRY_AREA_H #define _ASM_X86_CPU_ENTRY_AREA_H diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 58acda503817..59bf91c57aa8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -61,6 +61,13 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) +/* + * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the + * following BUILD_BUG_ON_ZERO() check but when 
NCAPINTS gets changed, all + * header macros which use NCAPINTS need to be changed. The duplicated macro + * use causes the compiler to issue errors for all headers so that all usage + * sites can be corrected. + */ #define REQUIRED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index e880f2408e29..c4fbe379cc0b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,7 +96,6 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -232,6 +231,8 @@ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ #define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ +#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */ +#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ @@ -355,6 +356,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ @@ -397,5 +399,7 @@ #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/cpuidle_haltpoll.h b/arch/x86/include/asm/cpuidle_haltpoll.h new file mode 100644 index 000000000000..c8b39c6716ff --- /dev/null +++ b/arch/x86/include/asm/cpuidle_haltpoll.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ARCH_HALTPOLL_H +#define _ARCH_HALTPOLL_H + +void arch_haltpoll_enable(unsigned int cpu); +void arch_haltpoll_disable(unsigned int cpu); + +#endif diff --git a/arch/x86/include/asm/crypto/aes.h b/arch/x86/include/asm/crypto/aes.h deleted file mode 100644 index c508521dd190..000000000000 --- a/arch/x86/include/asm/crypto/aes.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_AES_H -#define ASM_X86_AES_H - 
-#include <linux/crypto.h> -#include <crypto/aes.h> - -void crypto_aes_encrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, - const u8 *src); -void crypto_aes_decrypt_x86(struct crypto_aes_ctx *ctx, u8 *dst, - const u8 *src); -#endif diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index d1818634ae7e..8d4a8e1226ee 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h @@ -114,7 +114,7 @@ extern int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx, struct skcipher_request *req, common_glue_func_t tweak_fn, void *tweak_ctx, - void *crypt_ctx); + void *crypt_ctx, bool decrypt); extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, common_glue_func_t fn); diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 20a46150e0a8..9b8cb50768c2 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -73,6 +73,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b) #else # include <asm-generic/div64.h> + +static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) +{ + u64 q; + + asm ("mulq %2; divq %3" : "=a" (q) + : "a" (a), "rm" ((u64)mul), "rm" ((u64)div) + : "rdx"); + + return q; +} +#define mul_u64_u32_div mul_u64_u32_div + #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_DIV64_H */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 606a4b6a9812..43a82e59c59d 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -242,6 +242,7 @@ static inline bool efi_is_64bit(void) __efi_early()->runtime_services), __VA_ARGS__) extern bool efi_reboot_required(void); +extern bool efi_is_table_address(unsigned long phys_addr); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} @@ -249,6 +250,10 @@ static inline bool efi_reboot_required(void) { return false; } +static inline bool efi_is_table_address(unsigned long phys_addr) +{ + return false; +} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h deleted file mode 100644 index 47b7a1296245..000000000000 --- a/arch/x86/include/asm/error-injection.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_ERROR_INJECTION_H -#define _ASM_ERROR_INJECTION_H - -#include <linux/compiler.h> -#include <linux/linkage.h> -#include <asm/ptrace.h> -#include <asm-generic/error-injection.h> - -asmlinkage void just_return_func(void); -void override_function_with_return(struct pt_regs *regs); - -#endif /* _ASM_ERROR_INJECTION_H */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index cbd97e22d2f3..4154bc5f6a4e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -153,8 +153,8 @@ extern char irq_entries_start[]; extern char spurious_entries_start[]; #define VECTOR_UNUSED NULL -#define VECTOR_SHUTDOWN ((void *)~0UL) -#define VECTOR_RETRIGGERED ((void *)~1UL) +#define VECTOR_SHUTDOWN ((void *)-1L) +#define VECTOR_RETRIGGERED ((void *)-2L) typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index af78cd72b8f3..7741e211f7f5 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -13,6 +13,16 @@ #include <asm/page.h> /* + * While not explicitly listed 
in the TLFS, Hyper-V always runs with a page size + * of 4096. These definitions are used when communicating with Hyper-V using + * guest physical pages and guest physical page addresses, since the guest page + * size may not be 4096 on all architectures. + */ +#define HV_HYP_PAGE_SHIFT 12 +#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) +#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) + +/* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). */ @@ -170,7 +180,15 @@ /* Recommend using enlightened VMCS */ #define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) +/* + * Virtual processor will never share a physical core with another virtual + * processor, except for virtual processors that are reported as sibling SMT + * threads. + */ +#define HV_X64_NO_NONARCH_CORESHARING BIT(18) + /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ +#define HV_X64_NESTED_DIRECT_FLUSH BIT(17) #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) @@ -514,14 +532,24 @@ struct hv_timer_message_payload { __u64 delivery_time; /* When the message was delivered */ } __packed; +struct hv_nested_enlightenments_control { + struct { + __u32 directhypercall:1; + __u32 reserved:31; + } features; + struct { + __u32 reserved; + } hypercallControls; +} __packed; + /* Define virtual processor assist page structure. */ struct hv_vp_assist_page { __u32 apic_assist; - __u32 reserved; - __u64 vtl_control[2]; - __u64 nested_enlightenments_control[2]; - __u32 enlighten_vmentry; - __u32 padding; + __u32 reserved1; + __u64 vtl_control[3]; + struct hv_nested_enlightenments_control nested_control; + __u8 enlighten_vmentry; + __u8 reserved2[7]; __u64 current_nested_vmcs; } __packed; @@ -847,7 +875,7 @@ union hv_gpa_page_range { * count is equal with how many entries of union hv_gpa_page_range can * be populated into the input parameter page. */ -#define HV_MAX_FLUSH_REP_COUNT ((PAGE_SIZE - 2 * sizeof(u64)) / \ +#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ sizeof(union hv_gpa_page_range)) struct hv_guest_mapping_flush_list { @@ -872,4 +900,7 @@ struct hv_tlb_flush_ex { u64 gva_list[]; } __packed; +struct hv_partition_assist_pg { + u32 tlb_lock_count; +}; #endif diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index fe7c205233f1..c606c0b70738 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -5,9 +5,6 @@ /* * "Big Core" Processors (Branded as Core, Xeon, etc...) * - * The "_X" parts are generally the EP and EX Xeons, or the - * "Extreme" ones, like Broadwell-E, or Atom microserver. - * * While adding a new CPUID for a new microarchitecture, add a new * group to keep logically sorted out in chronological order. Within * that group keep the CPUID for the variants sorted by model number. @@ -21,9 +18,19 @@ * MICROARCH Is the code name for the micro-architecture for this core. * N.B. Not the platform name. * OPTDIFF If needed, a short string to differentiate by market segment. - * Exact strings here will vary over time. _DESKTOP, _MOBILE, and - * _X (short for Xeon server) should be used when they are - * appropriate. 
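/*
 * Worked example for the HV_MAX_FLUSH_REP_COUNT change above, assuming
 * union hv_gpa_page_range stays a single u64: the flush-list header
 * (address_space + flags) occupies 2 * 8 bytes of the fixed 4096-byte
 * Hyper-V page, leaving (4096 - 16) / 8 = 510 rep entries per hypercall,
 * now independent of the guest's own PAGE_SIZE. Standalone check:
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t hv_page = 1ull << 12;            /* HV_HYP_PAGE_SIZE      */
        uint64_t header  = 2 * sizeof(uint64_t);  /* address_space + flags */
        uint64_t entry   = sizeof(uint64_t);      /* one hv_gpa_page_range */

        printf("%llu\n", (unsigned long long)((hv_page - header) / entry)); /* 510 */
        return 0;
}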
+ * + * Common OPTDIFFs: + * + * - regular client parts + * _L - regular mobile parts + * _G - parts with extra graphics on + * _X - regular server parts + * _D - micro server parts + * + * Historical OPTDIFFs: + * + * _EP - 2 socket server parts + * _EX - 4+ socket server parts * * The #define line may optionally include a comment including platform names. */ @@ -49,30 +56,36 @@ #define INTEL_FAM6_IVYBRIDGE 0x3A #define INTEL_FAM6_IVYBRIDGE_X 0x3E -#define INTEL_FAM6_HASWELL_CORE 0x3C +#define INTEL_FAM6_HASWELL 0x3C #define INTEL_FAM6_HASWELL_X 0x3F -#define INTEL_FAM6_HASWELL_ULT 0x45 -#define INTEL_FAM6_HASWELL_GT3E 0x46 +#define INTEL_FAM6_HASWELL_L 0x45 +#define INTEL_FAM6_HASWELL_G 0x46 -#define INTEL_FAM6_BROADWELL_CORE 0x3D -#define INTEL_FAM6_BROADWELL_GT3E 0x47 +#define INTEL_FAM6_BROADWELL 0x3D +#define INTEL_FAM6_BROADWELL_G 0x47 #define INTEL_FAM6_BROADWELL_X 0x4F -#define INTEL_FAM6_BROADWELL_XEON_D 0x56 +#define INTEL_FAM6_BROADWELL_D 0x56 -#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E -#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +#define INTEL_FAM6_SKYLAKE_L 0x4E +#define INTEL_FAM6_SKYLAKE 0x5E #define INTEL_FAM6_SKYLAKE_X 0x55 -#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E -#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E +#define INTEL_FAM6_KABYLAKE_L 0x8E +#define INTEL_FAM6_KABYLAKE 0x9E -#define INTEL_FAM6_CANNONLAKE_MOBILE 0x66 +#define INTEL_FAM6_CANNONLAKE_L 0x66 #define INTEL_FAM6_ICELAKE_X 0x6A -#define INTEL_FAM6_ICELAKE_XEON_D 0x6C -#define INTEL_FAM6_ICELAKE_DESKTOP 0x7D -#define INTEL_FAM6_ICELAKE_MOBILE 0x7E +#define INTEL_FAM6_ICELAKE_D 0x6C +#define INTEL_FAM6_ICELAKE 0x7D +#define INTEL_FAM6_ICELAKE_L 0x7E #define INTEL_FAM6_ICELAKE_NNPI 0x9D +#define INTEL_FAM6_TIGERLAKE_L 0x8C +#define INTEL_FAM6_TIGERLAKE 0x8D + +#define INTEL_FAM6_COMETLAKE 0xA5 +#define INTEL_FAM6_COMETLAKE_L 0xA6 + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ @@ -83,17 +96,21 @@ #define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ #define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ -#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */ +#define INTEL_FAM6_ATOM_SILVERMONT_D 0x4D /* Avaton, Rangely */ #define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merriefield */ #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ #define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ +#define INTEL_FAM6_ATOM_AIRMONT_NP 0x75 /* Lightning Mountain */ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ -#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ +#define INTEL_FAM6_ATOM_GOLDMONT_D 0x5F /* Denverton */ + +/* Note: the micro-architecture is "Goldmont Plus" */ #define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ -#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */ +#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */ +#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h index 634f99b1dc22..423b788f495e 100644 --- a/arch/x86/include/asm/intel_pt.h +++ b/arch/x86/include/asm/intel_pt.h @@ -28,10 +28,12 @@ enum pt_capabilities { void cpu_emergency_stop_pt(void); extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap); extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap); +extern int is_intel_pt_event(struct perf_event *event); #else static inline void cpu_emergency_stop_pt(void) {} static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; } static 
inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; } +static inline int is_intel_pt_event(struct perf_event *event) { return 0; } #endif #endif /* _ASM_X86_INTEL_PT_H */ diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index baedab8ac538..b91623d521d9 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -4,7 +4,6 @@ extern int force_iommu, no_iommu; extern int iommu_detected; -extern int iommu_pass_through; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h deleted file mode 100644 index f73076be546a..000000000000 --- a/arch/x86/include/asm/ipi.h +++ /dev/null @@ -1,109 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ASM_X86_IPI_H -#define _ASM_X86_IPI_H - -#ifdef CONFIG_X86_LOCAL_APIC - -/* - * Copyright 2004 James Cleverdon, IBM. - * - * Generic APIC InterProcessor Interrupt code. - * - * Moved to include file by James Cleverdon from - * arch/x86-64/kernel/smp.c - * - * Copyrights from kernel/smp.c: - * - * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> - * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> - * (c) 2002,2003 Andi Kleen, SuSE Labs. - */ - -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/smp.h> - -/* - * the following functions deal with sending IPIs between CPUs. - * - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. - */ - -static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, - unsigned int dest) -{ - unsigned int icr = shortcut | dest; - - switch (vector) { - default: - icr |= APIC_DM_FIXED | vector; - break; - case NMI_VECTOR: - icr |= APIC_DM_NMI; - break; - } - return icr; -} - -static inline int __prepare_ICR2(unsigned int mask) -{ - return SET_APIC_DEST_FIELD(mask); -} - -static inline void __xapic_wait_icr_idle(void) -{ - while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); - -/* - * This is used to send an IPI with no shorthand notation (the destination is - * specified in bits 56 to 63 of the ICR). 
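/*
 * The __prepare_ICR() helper in the header being deleted above folds the
 * delivery mode into the ICR command word (the logic lives on elsewhere
 * in the apic code). A standalone sketch of that encoding; the APIC_DM_*
 * values are inlined from asm/apicdef.h as an assumption:
 */
#include <stdint.h>

#define DEMO_APIC_DM_FIXED 0x00000u   /* APIC_DM_FIXED */
#define DEMO_APIC_DM_NMI   0x00400u   /* APIC_DM_NMI   */
#define DEMO_NMI_VECTOR    0x02

static uint32_t demo_prepare_icr(uint32_t shortcut, int vector, uint32_t dest)
{
        uint32_t icr = shortcut | dest;

        if (vector == DEMO_NMI_VECTOR)
                icr |= DEMO_APIC_DM_NMI;                  /* NMI carries no vector */
        else
                icr |= DEMO_APIC_DM_FIXED | (uint32_t)vector;
        return icr;
}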
- */ -void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); - -extern void default_send_IPI_single(int cpu, int vector); -extern void default_send_IPI_single_phys(int cpu, int vector); -extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, - int vector); -extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, - int vector); - -/* Avoid include hell */ -#define NMI_VECTOR 0x02 - -extern int no_broadcast; - -static inline void __default_local_send_IPI_allbutself(int vector) -{ - if (no_broadcast || vector == NMI_VECTOR) - apic->send_IPI_mask_allbutself(cpu_online_mask, vector); - else - __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, apic->dest_logical); -} - -static inline void __default_local_send_IPI_all(int vector) -{ - if (no_broadcast || vector == NMI_VECTOR) - apic->send_IPI_mask(cpu_online_mask, vector); - else - __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector, apic->dest_logical); -} - -#ifdef CONFIG_X86_32 -extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, - int vector); -extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, - int vector); -extern void default_send_IPI_mask_logical(const struct cpumask *mask, - int vector); -extern void default_send_IPI_allbutself(int vector); -extern void default_send_IPI_all(int vector); -extern void default_send_IPI_self(int vector); -#endif - -#endif - -#endif /* _ASM_X86_IPI_H */ diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 8f95686ec27e..a176f6165d85 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -34,7 +34,7 @@ extern __visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs); extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); -extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs); +extern void handle_irq(struct irq_desc *desc, struct pt_regs *regs); extern __visible unsigned int do_IRQ(struct pt_regs *regs); diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index feab24cac610..77cf6c11f66b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -229,7 +229,7 @@ struct x86_emulate_ops { int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); - + int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -429,6 +429,7 @@ enum x86_intercept { x86_intercept_ins, x86_intercept_out, x86_intercept_outs, + x86_intercept_xsetbv, nr_x86_intercepts }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74e88e5edd9c..4fc61483919a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -219,13 +219,6 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting - * with the SVE bit in EPT PTEs. 
- */ -#define SPTE_SPECIAL_MASK (1ULL << 62) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -319,8 +312,12 @@ struct kvm_rmap_head { struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head lpage_disallowed_link; + bool unsync; + u8 mmu_valid_gen; bool mmio_cached; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* * The following two entries are used to key the shadow page in the @@ -717,7 +714,7 @@ struct kvm_vcpu_arch { /* Cache MMIO info */ u64 mmio_gva; - unsigned access; + unsigned mmio_access; gfn_t mmio_gfn; u64 mmio_gen; @@ -843,6 +840,8 @@ struct kvm_hv { /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; + + struct hv_partition_assist_pg *hv_pa_pg; }; enum kvm_irqchip_mode { @@ -856,11 +855,14 @@ struct kvm_arch { unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; + u8 mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. */ struct list_head active_mmu_pages; + struct list_head zapped_obsolete_pages; + struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; @@ -935,6 +937,7 @@ struct kvm_arch { bool exception_payload_enabled; struct kvm_pmu_event_filter *pmu_event_filter; + struct task_struct *nx_lpage_recovery_thread; }; struct kvm_vm_stat { @@ -948,6 +951,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong nx_lpage_splits; ulong max_mmu_page_hash_collisions; }; @@ -1070,7 +1074,7 @@ struct kvm_x86_ops { void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); - void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, @@ -1191,7 +1195,7 @@ struct kvm_x86_ops { int (*set_nested_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); - void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); @@ -1209,6 +1213,9 @@ struct kvm_x86_ops { uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + + bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); + int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1308,26 +1315,52 @@ extern u64 kvm_default_tsc_scaling_ratio; extern u64 kvm_mce_cap_supported; -enum emulation_result { - EMULATE_DONE, /* no further processing */ - EMULATE_USER_EXIT, /* kvm_run ready for userspace exit */ - EMULATE_FAIL, /* can't emulate this instruction */ -}; - +/* + * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing + * userspace I/O) to indicate that the emulation context + * should be reused as is, i.e. skip initialization of + * emulation context, instruction fetch and decode. + * + * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. + * Indicates that only select instructions (tagged with + * EmulateOnUD) should be emulated (to minimize the emulator + * attack surface). See also EMULTYPE_TRAP_UD_FORCED.
+ * + * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to + * decode the instruction length. For use *only* by + * kvm_x86_ops->skip_emulated_instruction() implementations. + * + * EMULTYPE_ALLOW_RETRY - Set when the emulator should resume the guest to + * retry native execution under certain conditions. + * + * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was + * triggered by KVM's magic "force emulation" prefix, + * which is opt in via module param (off by default). + * Bypasses EmulateOnUD restriction despite emulating + * due to an intercepted #UD (see EMULTYPE_TRAP_UD). + * Used to test the full emulator from userspace. + * + * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware + * backdoor emulation, which is opt in via module param. + * VMware backdoor emulation handles select instructions + * and reinjects the #GP for all other cases. + */ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_ALLOW_RETRY (1 << 3) -#define EMULTYPE_NO_UD_ON_FAIL (1 << 4) -#define EMULTYPE_VMWARE (1 << 5) +#define EMULTYPE_TRAP_UD_FORCED (1 << 4) +#define EMULTYPE_VMWARE_GP (1 << 5) int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); -int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); -int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); struct x86_emulate_ctxt; @@ -1500,7 +1533,7 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -asmlinkage void __noreturn kvm_spurious_fault(void); +asmlinkage void kvm_spurious_fault(void); /* * Hardware virtualization extension instructions may fault if a @@ -1508,24 +1541,14 @@ asmlinkage void kvm_spurious_fault(void); * Usually after catching the fault we just panic; during reboot * instead the instruction is ignored.
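/*
 * The EMULTYPE_* values above are independent flag bits, so callers build
 * an emulation_type mask and the emulator tests it with bitwise AND. A
 * hypothetical caller sketch — the demo_* name is invented, while
 * kvm_emulate_instruction() is the real entry point declared above:
 */
static int demo_handle_ud(struct kvm_vcpu *vcpu, bool forced_prefix_seen)
{
        int emul_type = EMULTYPE_TRAP_UD;

        /* The magic prefix opts this #UD out of the EmulateOnUD filter. */
        if (forced_prefix_seen)
                emul_type = EMULTYPE_TRAP_UD_FORCED;

        return kvm_emulate_instruction(vcpu, emul_type);
}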
*/ -#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn) \ +#define __kvm_handle_fault_on_reboot(insn) \ "666: \n\t" \ insn "\n\t" \ "jmp 668f \n\t" \ "667: \n\t" \ "call kvm_spurious_fault \n\t" \ "668: \n\t" \ - ".pushsection .fixup, \"ax\" \n\t" \ - "700: \n\t" \ - cleanup_insn "\n\t" \ - "cmpb $0, kvm_rebooting\n\t" \ - "je 667b \n\t" \ - "jmp 668b \n\t" \ - ".popsection \n\t" \ - _ASM_EXTABLE(666b, 700b) - -#define __kvm_handle_fault_on_reboot(insn) \ - ____kvm_handle_fault_on_reboot(insn, "") + _ASM_EXTABLE(666b, 667b) #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); @@ -1581,6 +1604,13 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); +static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) +{ + /* We can only post Fixed and LowPrio IRQs */ + return (irq->delivery_mode == dest_Fixed || + irq->delivery_mode == dest_LowestPrio); +} + static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { if (kvm_x86_ops->vcpu_blocking) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 0c196c47d621..848ce43b9040 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -92,6 +92,16 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; +static inline bool mem_encrypt_active(void) +{ + return sme_me_mask; +} + +static inline u64 sme_get_me_mask(void) +{ + return sme_me_mask; +} + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 2ef31cc8c529..f4138aeb4280 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -218,7 +218,8 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) void __init hyperv_init(void); void hyperv_setup_mmu_ops(void); - +void *hv_alloc_hyperv_page(void); +void hv_free_hyperv_page(unsigned long addr); void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); @@ -241,6 +242,8 @@ static inline void hv_apic_init(void) {} #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline void hyperv_setup_mmu_ops(void) {} +static inline void *hv_alloc_hyperv_page(void) { return NULL; } +static inline void hv_free_hyperv_page(unsigned long addr) {} static inline void set_hv_tscchange_cb(void (*cb)(void)) {} static inline void clear_hv_tscchange_cb(void) {} static inline void hyperv_stop_tsc_emulation(void) {}; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 271d837d69a8..6a3124664289 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -93,6 +93,18 @@ * Microarchitectural Data * Sampling (MDS) vulnerabilities. */ +#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* + * The processor is not susceptible to a + * machine check error due to modifying the + * code page size along with either the + * physical address or cache type + * without TLB invalidation. + */ +#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ +#define ARCH_CAP_TAA_NO BIT(8) /* + * Not susceptible to + * TSX Async Abort (TAA) vulnerabilities. 
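/*
 * Typical shape of a caller of the new mem_encrypt_active() and
 * sme_get_me_mask() helpers above — a hedged sketch, not a specific call
 * site: work is gated on the mask being non-zero, and the mask is OR'd
 * into page-table protections that must stay encrypted:
 */
static unsigned long demo_encrypt_prot(unsigned long prot)
{
        if (!mem_encrypt_active())
                return prot;                    /* SME/SEV not active */

        return prot | sme_get_me_mask();        /* keep the C-bit set */
}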
+ */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -103,6 +115,10 @@ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e +#define MSR_IA32_TSX_CTRL 0x00000122 +#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ +#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 @@ -375,14 +391,22 @@ /* Alternative perfctr range with full access. */ #define MSR_IA32_PMC0 0x000004c1 -/* AMD64 MSRs. Not complete. See the architecture manual for a more - complete list. */ +/* Auto-reload via MSR instead of DS area */ +#define MSR_RELOAD_PMC0 0x000014c1 +#define MSR_RELOAD_FIXED_CTR0 0x00001309 +/* + * AMD64 MSRs. Not complete. See the architecture manual for a more + * complete list. + */ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD_PERF_CTL 0xc0010062 +#define MSR_AMD_PERF_STATUS 0xc0010063 +#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_LS_CFG 0xc0011020 @@ -561,9 +585,6 @@ #define MSR_IA32_PERF_STATUS 0x00000198 #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff -#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 -#define MSR_AMD_PERF_STATUS 0xc0010063 -#define MSR_AMD_PERF_CTL 0xc0010062 #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 5cc3930cb465..86f20d520a07 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -233,8 +233,7 @@ static __always_inline unsigned long long rdtsc_ordered(void) * Thus, use the preferred barrier on the respective CPU, aiming for * RDTSCP as the default. 
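/*
 * How the new MSR_IA32_TSX_CTRL bits above fit together — a sketch
 * modelled on the tsx=off flow, not a verbatim copy: availability is
 * advertised by ARCH_CAP_TSX_CTRL_MSR in MSR_IA32_ARCH_CAPABILITIES,
 * then both control bits are set to abort RTM transactions and hide
 * HLE/RTM from CPUID:
 */
static void demo_tsx_disable(void)
{
        u64 arch_cap, tsx;

        rdmsrl(MSR_IA32_ARCH_CAPABILITIES, arch_cap);
        if (!(arch_cap & ARCH_CAP_TSX_CTRL_MSR))
                return;                         /* control MSR not present */

        rdmsrl(MSR_IA32_TSX_CTRL, tsx);
        tsx |= TSX_CTRL_RTM_DISABLE;            /* force RTM to abort     */
        tsx |= TSX_CTRL_CPUID_CLEAR;            /* clear CPUID.RTM/HLE    */
        wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}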
*/ - asm volatile(ALTERNATIVE_3("rdtsc", - "mfence; rdtsc", X86_FEATURE_MFENCE_RDTSC, + asm volatile(ALTERNATIVE_2("rdtsc", "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, "rdtscp", X86_FEATURE_RDTSCP) : EAX_EDX_RET(val, low, high) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index e28f8b723b5c..9d5252c9685c 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -21,7 +21,7 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 #define MWAITX_ECX_TIMER_ENABLE BIT(1) #define MWAITX_MAX_LOOPS ((u32)-1) -#define MWAITX_DISABLE_CSTATES 0xf +#define MWAITX_DISABLE_CSTATES 0xf0 static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 80bc209c0708..5c24a7b35166 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); #include <asm/segment.h> /** - * mds_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * This uses the otherwise unused and obsolete VERW instruction in * combination with microcode which triggers a CPU buffer flush when the @@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void) } /** - * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * Clear CPU buffers if the corresponding static key is enabled */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index dce26f1d13e1..69089d46f128 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -139,18 +139,6 @@ static inline void __write_cr4(unsigned long x) PVOP_VCALL1(cpu.write_cr4, x); } -#ifdef CONFIG_X86_64 -static inline unsigned long read_cr8(void) -{ - return PVOP_CALL0(unsigned long, cpu.read_cr8); -} - -static inline void write_cr8(unsigned long x) -{ - PVOP_VCALL1(cpu.write_cr8, x); -} -#endif - static inline void arch_safe_halt(void) { PVOP_VCALL0(irq.safe_halt); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 639b2df445ee..70b654f3ffe5 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -119,11 +119,6 @@ struct pv_cpu_ops { void (*write_cr4)(unsigned long); -#ifdef CONFIG_X86_64 - unsigned long (*read_cr8)(void); - void (*write_cr8)(unsigned long); -#endif - /* Segment descriptor handling */ void (*load_tr_desc)(void); void (*load_gdt)(const struct desc_ptr *); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index c78da8eda8f2..0dca7f7aeff2 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -29,8 +29,6 @@ extern pgd_t swapper_pg_dir[1024]; extern pgd_t initial_page_table[1024]; extern pmd_t initial_pg_pmd[]; -static inline void pgtable_cache_init(void) { } -static inline void check_pgt_cache(void) { } void paging_init(void); void sync_initial_page_table(void); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4990d26dfc73..0b6c4042942a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -241,9 +241,6 @@ extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define pgtable_cache_init() do { } while (0) -#define check_pgt_cache() do { } while (0) - #define 
PAGE_AGP PAGE_KERNEL_NOCACHE #define HAVE_PAGE_AGP 1 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 99a7fa9ab0a3..3d4cb83a8828 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -102,7 +102,7 @@ static __always_inline bool should_resched(int preempt_offset) return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION extern asmlinkage void ___preempt_schedule(void); # define __preempt_schedule() \ asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6e0a3b43d027..54f5d54280f6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -988,4 +988,11 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 5df09a0b80b8..07375b476c4f 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PTI_H #define _ASM_X86_PTI_H #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index bd5ac6cc37db..444d6fd9a6d8 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -63,10 +63,25 @@ static inline bool vcpu_is_preempted(long cpu) #endif #ifdef CONFIG_PARAVIRT +/* + * virt_spin_lock_key - enables (by default) the virt_spin_lock() hijack. + * + * Native (and PV wanting native due to vCPU pinning) should disable this key. + * It is done in this backwards fashion to only have a single direction change, + * which removes ordering between native_pv_spin_init() and HV setup. + */ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); void native_pv_lock_init(void) __init; +/* + * Shortcut for the queued_spin_lock_slowpath() function that allows + * virt to hijack it. + * + * Returns: + * true - lock has been negotiated, all done; + * false - queued_spin_lock_slowpath() will do its thing. 
+ */ #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index c53682303c9c..09ecc32f6524 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -20,7 +20,6 @@ struct real_mode_header { u32 ro_end; /* SMP trampoline */ u32 trampoline_start; - u32 trampoline_status; u32 trampoline_header; #ifdef CONFIG_X86_64 u32 trampoline_pgd; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index e1356a3b8223..e15f364efbcc 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -143,6 +143,7 @@ void play_dead_common(void); void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); +void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 219be88a59d2..6d37b8fcfc77 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -73,20 +73,6 @@ static inline unsigned long native_read_cr4(void) void native_write_cr4(unsigned long val); -#ifdef CONFIG_X86_64 -static inline unsigned long native_read_cr8(void) -{ - unsigned long cr8; - asm volatile("movq %%cr8,%0" : "=r" (cr8)); - return cr8; -} - -static inline void native_write_cr8(unsigned long val) -{ - asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); -} -#endif - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS static inline u32 rdpkru(void) { @@ -200,16 +186,6 @@ static inline void wbinvd(void) #ifdef CONFIG_X86_64 -static inline unsigned long read_cr8(void) -{ - return native_read_cr8(); -} - -static inline void write_cr8(unsigned long x) -{ - native_write_cr8(x); -} - static inline void load_gs_index(unsigned selector) { native_load_gs_index(selector); diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index a7af9f53c0cb..35bb35d28733 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -34,7 +34,7 @@ struct saved_context { */ unsigned long kernelmode_gs_base, usermode_gs_base, fs_base; - unsigned long cr0, cr2, cr3, cr4, cr8; + unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; bool misc_enable_saved; struct saved_msrs saved_msrs; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index dec9c1e84c78..6ece8561ba66 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -52,6 +52,7 @@ enum { INTERCEPT_MWAIT, INTERCEPT_MWAIT_COND, INTERCEPT_XSETBV, + INTERCEPT_RDPRU, }; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 2dc4a021beea..8db3fdb6102e 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -36,6 +36,10 @@ extern const sys_call_ptr_t sys_call_table[]; extern const sys_call_ptr_t ia32_sys_call_table[]; #endif +#ifdef CONFIG_X86_X32_ABI +extern const sys_call_ptr_t x32_sys_call_table[]; +#endif + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. 
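/*
 * The virt_spin_lock() body itself is elided by the diff context above;
 * its known shape in this era is a plain test-and-set fallback taken only
 * while the static key is enabled — sketched here for reference, not part
 * of the patch:
 */
static inline bool demo_virt_spin_lock(struct qspinlock *lock)
{
        if (!static_branch_likely(&virt_spin_lock_key))
                return false;           /* native: use the queued slowpath */

        /*
         * Unfair test-and-set loop: avoids queueing vCPUs that the
         * hypervisor may preempt while they hold a queue position.
         */
        do {
                while (atomic_read(&lock->val))
                        cpu_relax();
        } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);

        return true;
}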
* This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 70c09967a999..5e8319bb207a 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,8 +45,8 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. - * On the local CPU you need to be protected again NMI or MCE handlers seeing an - * inconsistent instruction while you patch. + * On the local CPU you need to be protected against NMI or MCE handlers seeing + * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 9c4435307ff8..61d93f062a36 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -444,8 +444,10 @@ __pu_label: \ ({ \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ + __typeof__(ptr) __gu_ptr = (ptr); \ + __typeof__(size) __gu_size = (size); \ __uaccess_begin_nospec(); \ - __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ + __get_user_size(__gu_val, __gu_ptr, __gu_size, __gu_err, -EFAULT); \ __uaccess_end(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ __builtin_expect(__gu_err, 0); \ @@ -732,5 +734,28 @@ do { \ if (unlikely(__gu_err)) goto err_label; \ } while (0) +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. 
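/*
 * Why the __get_user() change above snapshots its arguments into
 * __gu_ptr/__gu_size: a macro that expands a parameter more than once
 * re-runs any side effects in it. A tiny illustration of the hazard the
 * temporaries avoid (demo macros, not kernel code):
 */
#define DEMO_BAD_DOUBLE(x)  ((x) + (x))                      /* expands x twice */
#define DEMO_GOOD_DOUBLE(x) ({ __typeof__(x) _v = (x); _v + _v; })

/*
 * With int i = 0: DEMO_BAD_DOUBLE(i++) increments i twice (and is
 * undefined behaviour); DEMO_GOOD_DOUBLE(i++) evaluates i++ exactly once.
 */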
+ */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)src,(type __user *)dst,label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +} while (0) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 097589753fec..a7dd080749ce 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -5,12 +5,6 @@ #include <uapi/asm/unistd.h> -# ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_MASK (~(__X32_SYSCALL_BIT)) -# else -# define __SYSCALL_MASK (~0) -# endif - # ifdef CONFIG_X86_32 # include <asm/unistd_32.h> diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index e60c45fd3679..6bc6d89d8e2a 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -12,10 +12,12 @@ struct mm_struct; #ifdef CONFIG_X86_UV #include <linux/efi.h> +extern unsigned long uv_systab_phys; + extern enum uv_system_type get_uv_system_type(void); static inline bool is_early_uv_system(void) { - return !((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab); + return uv_systab_phys && uv_systab_phys != EFI_INVALID_TABLE_ADDR; } extern int is_uv_system(void); extern int is_uv_hubless(void); diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h index ba71a63cdac4..e9ee139cf29e 100644 --- a/arch/x86/include/asm/vdso/gettimeofday.h +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -51,7 +51,7 @@ extern struct pvclock_vsyscall_time_info pvclock_page __attribute__((visibility("hidden"))); #endif -#ifdef CONFIG_HYPERV_TSCPAGE +#ifdef CONFIG_HYPERV_TIMER extern struct ms_hyperv_tsc_page hvclock_page __attribute__((visibility("hidden"))); #endif @@ -228,7 +228,7 @@ static u64 vread_pvclock(void) } #endif -#ifdef CONFIG_HYPERV_TSCPAGE +#ifdef CONFIG_HYPERV_TIMER static u64 vread_hvclock(void) { return hv_read_tsc_page(&hvclock_page); @@ -251,7 +251,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode) return vread_pvclock(); } #endif -#ifdef CONFIG_HYPERV_TSCPAGE +#ifdef CONFIG_HYPERV_TIMER if (clock_mode == VCLOCK_HVCLOCK) { barrier(); return vread_hvclock(); diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h new file mode 100644 index 000000000000..ac9fc51e2b18 --- /dev/null +++ b/arch/x86/include/asm/vmware.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 or MIT */ +#ifndef _ASM_X86_VMWARE_H +#define _ASM_X86_VMWARE_H + +#include <asm/cpufeatures.h> +#include <asm/alternative.h> +#include <linux/stringify.h> + +/* + * The hypercall definitions differ in the low word of the %edx argument + * in the following way: the old port base interface uses the port + * number to distinguish between high- and low bandwidth versions. + * + * The new vmcall interface instead uses a set of flags to select + * bandwidth mode and transfer direction. The flags should be loaded + * into %dx by any user and are automatically replaced by the port + * number if the VMWARE_HYPERVISOR_PORT method is used. 
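/*
 * Usage sketch for the unsafe_copy_to_user() added above: it is only
 * legal inside a user_access_begin()/user_access_end() window, with a
 * local label for the fault path (invented demo_* caller):
 */
static int demo_copy_to_user(char __user *dst, const char *src, size_t len)
{
        if (!user_access_begin(dst, len))
                return -EFAULT;

        /* Copies in descending u64/u32/u16/u8 chunks, faulting to the label. */
        unsafe_copy_to_user(dst, src, len, Efault);
        user_access_end();
        return 0;
Efault:
        user_access_end();
        return -EFAULT;
}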
+ * + * In short, new driver code should strictly use the new definition of + * %dx content. + */ + +/* Old port-based version */ +#define VMWARE_HYPERVISOR_PORT 0x5658 +#define VMWARE_HYPERVISOR_PORT_HB 0x5659 + +/* Current vmcall / vmmcall version */ +#define VMWARE_HYPERVISOR_HB BIT(0) +#define VMWARE_HYPERVISOR_OUT BIT(1) + +/* The low bandwidth call. The low word of edx is presumed clear. */ +#define VMWARE_HYPERCALL \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT) ", %%dx; " \ + "inl (%%dx), %%eax", \ + "vmcall", X86_FEATURE_VMCALL, \ + "vmmcall", X86_FEATURE_VMW_VMMCALL) + +/* + * The high bandwidth out call. The low word of edx is presumed to have the + * HB and OUT bits set. + */ +#define VMWARE_HYPERCALL_HB_OUT \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ + "rep outsb", \ + "vmcall", X86_FEATURE_VMCALL, \ + "vmmcall", X86_FEATURE_VMW_VMMCALL) + +/* + * The high bandwidth in call. The low word of edx is presumed to have the + * HB bit set. + */ +#define VMWARE_HYPERCALL_HB_IN \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ + "rep insb", \ + "vmcall", X86_FEATURE_VMCALL, \ + "vmmcall", X86_FEATURE_VMW_VMMCALL) +#endif diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a39136b0d509..1835767aa335 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -69,6 +69,7 @@ #define SECONDARY_EXEC_PT_USE_GPA 0x01000000 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 +#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -110,6 +111,7 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 #define VMX_MISC_ZERO_LEN_INS 0x40000000 +#define VMX_MISC_MSR_LIST_MULTIPLIER 512 /* VMFUNC functions */ #define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 @@ -562,6 +564,20 @@ enum vm_instruction_error_number { VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28, }; +/* + * VM-instruction errors that can be encountered on VM-Enter, used to trace + * nested VM-Enter failures reported by hardware. Errors unique to VM-Enter + * from a SMI Transfer Monitor are not included as things have gone seriously + * sideways if we get one of those... 
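/*
 * Invocation sketch for the VMWARE_HYPERCALL macro above, following the
 * register protocol of the existing port interface (%eax = magic,
 * %ecx = command, %edx = 0 so the flags word starts clear); the magic
 * value is the long-standing "VMXh" constant, inlined here as an
 * assumption rather than taken from this patch:
 */
#define DEMO_VMWARE_MAGIC 0x564D5868u           /* "VMXh" */

static inline u32 demo_vmware_hypercall(u32 cmd, u32 arg)
{
        u32 eax;

        asm volatile(VMWARE_HYPERCALL
                     : "=a" (eax)
                     : "a" (DEMO_VMWARE_MAGIC), "b" (arg),
                       "c" (cmd), "d" (0)
                     : "memory");
        return eax;
}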
+ */ +#define VMX_VMENTER_INSTRUCTION_ERRORS \ + { VMXERR_VMLAUNCH_NONCLEAR_VMCS, "VMLAUNCH_NONCLEAR_VMCS" }, \ + { VMXERR_VMRESUME_NONLAUNCHED_VMCS, "VMRESUME_NONLAUNCHED_VMCS" }, \ + { VMXERR_VMRESUME_AFTER_VMXOFF, "VMRESUME_AFTER_VMXOFF" }, \ + { VMXERR_ENTRY_INVALID_CONTROL_FIELD, "VMENTRY_INVALID_CONTROL_FIELD" }, \ + { VMXERR_ENTRY_INVALID_HOST_STATE_FIELD, "VMENTRY_INVALID_HOST_STATE_FIELD" }, \ + { VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS, "VMENTRY_EVENTS_BLOCKED_BY_MOV_SS" } + enum vmx_l1d_flush_state { VMENTER_L1D_FLUSH_AUTO, VMENTER_L1D_FLUSH_NEVER, diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ac0934189017..19435858df5f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -134,10 +134,12 @@ struct x86_hyper_init { /** * struct x86_init_acpi - x86 ACPI init functions + * @set_root_pointer: set RSDP address * @get_root_pointer: get RSDP address * @reduced_hw_early_init: hardware reduced platform early init */ struct x86_init_acpi { + void (*set_root_pointer)(u64 addr); u64 (*get_root_pointer)(void); void (*reduced_hw_early_init)(void); }; diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h index 116777e7f387..63cd41b2e17a 100644 --- a/arch/x86/include/asm/xen/page-coherent.h +++ b/arch/x86/include/asm/xen/page-coherent.h @@ -21,18 +21,4 @@ static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, free_pages((unsigned long) cpu_addr, get_order(size)); } -static inline void xen_dma_map_page(struct device *hwdev, struct page *page, - dma_addr_t dev_addr, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) { } - -static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs) { } - -static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir) { } - -static inline void xen_dma_sync_single_for_device(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir) { } - #endif /* _ASM_X86_XEN_PAGE_COHERENT_H */ diff --git a/arch/x86/include/uapi/asm/errno.h b/arch/x86/include/uapi/asm/errno.h deleted file mode 100644 index 4c82b503d92f..000000000000 --- a/arch/x86/include/uapi/asm/errno.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/errno.h> diff --git a/arch/x86/include/uapi/asm/fcntl.h b/arch/x86/include/uapi/asm/fcntl.h deleted file mode 100644 index 46ab12db5739..000000000000 --- a/arch/x86/include/uapi/asm/fcntl.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/fcntl.h> diff --git a/arch/x86/include/uapi/asm/ioctl.h b/arch/x86/include/uapi/asm/ioctl.h deleted file mode 100644 index b279fe06dfe5..000000000000 --- a/arch/x86/include/uapi/asm/ioctl.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ioctl.h> diff --git a/arch/x86/include/uapi/asm/ioctls.h b/arch/x86/include/uapi/asm/ioctls.h deleted file mode 100644 index ec34c760665e..000000000000 --- a/arch/x86/include/uapi/asm/ioctls.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ioctls.h> diff --git a/arch/x86/include/uapi/asm/ipcbuf.h b/arch/x86/include/uapi/asm/ipcbuf.h deleted file mode 100644 index 84c7e51cb6d0..000000000000 --- a/arch/x86/include/uapi/asm/ipcbuf.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/ipcbuf.h> diff --git a/arch/x86/include/uapi/asm/param.h b/arch/x86/include/uapi/asm/param.h deleted file mode 100644 index 965d45427975..000000000000 ---
a/arch/x86/include/uapi/asm/param.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/param.h> diff --git a/arch/x86/include/uapi/asm/resource.h b/arch/x86/include/uapi/asm/resource.h deleted file mode 100644 index 04bc4db8921b..000000000000 --- a/arch/x86/include/uapi/asm/resource.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/resource.h> diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index a9731f8a480f..2e8a30f06c74 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -75,6 +75,7 @@ #define SVM_EXIT_MWAIT 0x08b #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d +#define SVM_EXIT_RDPRU 0x08e #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 #define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 diff --git a/arch/x86/include/uapi/asm/termbits.h b/arch/x86/include/uapi/asm/termbits.h deleted file mode 100644 index 3935b106de79..000000000000 --- a/arch/x86/include/uapi/asm/termbits.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/termbits.h> diff --git a/arch/x86/include/uapi/asm/termios.h b/arch/x86/include/uapi/asm/termios.h deleted file mode 100644 index 280d78a9d966..000000000000 --- a/arch/x86/include/uapi/asm/termios.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/termios.h> diff --git a/arch/x86/include/uapi/asm/types.h b/arch/x86/include/uapi/asm/types.h deleted file mode 100644 index 9d5c11a24279..000000000000 --- a/arch/x86/include/uapi/asm/types.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_X86_TYPES_H -#define _ASM_X86_TYPES_H - -#include <asm-generic/types.h> - -#endif /* _ASM_X86_TYPES_H */ diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h index 30d7d04d72d6..196fdd02b8b1 100644 --- a/arch/x86/include/uapi/asm/unistd.h +++ b/arch/x86/include/uapi/asm/unistd.h @@ -3,7 +3,7 @@ #define _UAPI_ASM_X86_UNISTD_H /* x32 syscall flag bit */ -#define __X32_SYSCALL_BIT 0x40000000 +#define __X32_SYSCALL_BIT 0x40000000UL #ifndef __KERNEL__ # ifdef __i386__ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f0b0c90dd398..3eb8411ab60e 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -31,6 +31,7 @@ #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 #define EXIT_REASON_TRIPLE_FAULT 2 +#define EXIT_REASON_INIT_SIGNAL 3 #define EXIT_REASON_PENDING_INTERRUPT 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -85,11 +86,14 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_UMWAIT 67 +#define EXIT_REASON_TPAUSE 68 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \ { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ @@ -142,7 +146,9 @@ { EXIT_REASON_RDSEED, "RDSEED" }, \ { EXIT_REASON_PML_FULL, "PML_FULL" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_UMWAIT, "UMWAIT" }, \ + { EXIT_REASON_TPAUSE, "TPAUSE" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 17b33ef604f3..04205ce127a1 
100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1760,6 +1760,11 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) e820__update_table_print(); } +void x86_default_set_root_pointer(u64 addr) +{ + boot_params.acpi_rsdp_addr = addr; +} + u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index b0715c3ac18d..7f9ade13bbcf 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -18,8 +18,13 @@ ENTRY(wakeup_long64) movq saved_magic, %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax - jne bogus_64_magic + je 2f + /* stop here on a saved_magic mismatch */ + movq $0xbad6d61676963, %rcx +1: + jmp 1b +2: movw $__KERNEL_DS, %ax movw %ax, %ss movw %ax, %ds @@ -37,9 +42,6 @@ ENTRY(wakeup_long64) jmp *%rax ENDPROC(wakeup_long64) -bogus_64_magic: - jmp bogus_64_magic - ENTRY(do_suspend_lowlevel) FRAME_BEGIN subq $8, %rsp diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ccd32013c47a..9d3a971ea364 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -713,7 +713,7 @@ void __init alternative_instructions(void) * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this * case we must do something about the corruption. - * Ignoring it is worse than a unlikely patching race. + * Ignoring it is worse than an unlikely patching race. * Also machine checks tend to be broadcast and if one CPU * goes into machine check the others follow quickly, so we don't * expect a machine check to cause undue problems during to code @@ -753,8 +753,8 @@ void __init alternative_instructions(void) * When you use this code to patch more than one byte of an instruction * you need to make sure that other CPUs cannot execute this code in parallel. * Also no thread must be currently preempted in the middle of these - * instructions. And on the local CPU you need to be protected again NMI or MCE - * handlers seeing an inconsistent instruction while you patch. + * instructions. And on the local CPU you need to be protected against NMI or + * MCE handlers seeing an inconsistent instruction while you patch. */ void __init_or_module text_poke_early(void *addr, const void *opcode, size_t len) diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a585ea6f686a..a6ac3712db8b 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -677,7 +677,10 @@ static const struct dma_map_ops gart_dma_ops = { .unmap_page = gart_unmap_page, .alloc = gart_alloc_coherent, .free = gart_free_coherent, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, + .get_required_mask = dma_direct_get_required_mask, }; static void gart_iommu_shutdown(void) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index d63e63b7d1d9..251c795b4eb3 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,6 +21,7 @@ #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 /* Protect the PCI config register pairs used for SMN and DF indirect access. 
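/*
 * An aside on the constant parked in %rcx by the wakeup_64.S change
 * above: 0xbad6d61676963 is a debugger breadcrumb, 0xbad followed by the
 * ASCII bytes of "magic", so a hang on a saved_magic mismatch is easy to
 * recognize in a register dump. A quick standalone decode:
 */
#include <stdio.h>

int main(void)
{
        unsigned long long v = 0xbad6d61676963ull;

        printf("0x%llx\n", v >> 40);                    /* prints 0xbad   */
        for (int s = 32; s >= 0; s -= 8)
                putchar((int)((v >> s) & 0xff));        /* prints "magic" */
        putchar('\n');
        return 0;
}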
*/ static DEFINE_MUTEX(smn_mutex); @@ -50,6 +51,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, {} }; EXPORT_SYMBOL_GPL(amd_nb_misc_ids); @@ -63,6 +65,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dba2828b779a..2b0faf86da1b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -65,10 +65,10 @@ unsigned int num_processors; unsigned disabled_cpus; /* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; +unsigned int boot_cpu_physical_apicid __ro_after_init = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); -u8 boot_cpu_apic_version; +u8 boot_cpu_apic_version __ro_after_init; /* * The highest APIC ID seen during enumeration. @@ -85,13 +85,13 @@ physid_mask_t phys_cpu_present_map; * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to * avoid undefined behaviour caused by sending INIT from AP to BSP. */ -static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; +static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; /* * This variable controls which CPUs receive external NMIs. By default, * external NMIs are delivered only to the BSP. */ -static int apic_extnmi = APIC_EXTNMI_BSP; +static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; /* * Map cpu index to physical APIC ID @@ -114,7 +114,7 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); /* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; +static int enabled_via_apicbase __ro_after_init; /* * Handle interrupt mode configuration register (IMCR). 
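/*
 * Pattern note for the __ro_after_init conversions in the apic.c hunks
 * above and below: the attribute places a variable in a section that is
 * write-protected once boot finishes, so it fits state written only from
 * setup code. Minimal sketch with invented demo_* names:
 */
static int demo_mode __ro_after_init;   /* written during boot only */

static int __init demo_setup(char *arg)
{
        demo_mode = 1;  /* last legal write happens before mark_rodata_ro() */
        return 0;
}
early_param("demo_mode", demo_setup);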
@@ -172,23 +172,23 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif -unsigned long mp_lapic_addr; -int disable_apic; +unsigned long mp_lapic_addr __ro_after_init; +int disable_apic __ro_after_init; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ -int local_apic_timer_c2_ok; +int local_apic_timer_c2_ok __ro_after_init; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +int apic_verbosity __ro_after_init; -int pic_mode; +int pic_mode __ro_after_init; /* Have we found an MP table */ -int smp_found_config; +int smp_found_config __ro_after_init; static struct resource lapic_resource = { .name = "Local APIC", @@ -199,7 +199,7 @@ unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); -static unsigned long apic_phys; +static unsigned long apic_phys __ro_after_init; /* * Get the LAPIC version @@ -590,21 +590,21 @@ static u32 skx_deadline_rev(void) static const struct x86_cpu_id deadline_match[] = { DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), - DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_D, bdx_deadline_rev), DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_GT3E, 0x17), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL, 0x22), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_L, 0x20), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_CORE, 0x25), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_GT3E, 0x17), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL, 0x25), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_G, 0x17), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_MOBILE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_DESKTOP, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_L, 0xb2), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE, 0xb2), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_MOBILE, 0x52), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_DESKTOP, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE_L, 0x52), + DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_KABYLAKE, 0x52), {}, }; @@ -834,6 +834,10 @@ bool __init apic_needs_pit(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return true; + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) + return true; + /* Deadline timer is based on TSC so no further PIT action required */ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) return false; @@ -1179,10 +1183,6 @@ void clear_local_APIC(void) apic_write(APIC_LVT0, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT1); apic_write(APIC_LVT1, v | APIC_LVT_MASKED); - if (!x2apic_enabled()) { - v = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - apic_write(APIC_LDR, v); - } if (maxlvt >= 4) { v = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); @@ -1224,25 +1224,38 @@ void clear_local_APIC(void) } /** - * disable_local_APIC - clear and disable the local APIC + * apic_soft_disable - Clears and software disables the local APIC on hotplug + * + * Contrary to disable_local_APIC() this does not touch the enable 
bit in + * MSR_IA32_APICBASE. Clearing that bit on systems based on the 3 wire APIC + * bus would require a hardware reset as the APIC would lose track of bus + * arbitration. On systems with FSB delivery APICBASE could be disabled, + * but it has to be guaranteed that no interrupt is sent to the APIC while + * in that state and it's not clear from the SDM whether it still responds + * to INIT/SIPI messages. Stay on the safe side and use software disable. */ -void disable_local_APIC(void) +void apic_soft_disable(void) { - unsigned int value; - - /* APIC hasn't been mapped yet */ - if (!x2apic_mode && !apic_phys) - return; + u32 value; clear_local_APIC(); - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ + /* Soft disable APIC (implies clearing of registers for 82489DX!). */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; apic_write(APIC_SPIV, value); +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + /* APIC hasn't been mapped yet */ + if (!x2apic_mode && !apic_phys) + return; + + apic_soft_disable(); #ifdef CONFIG_X86_32 /* @@ -1307,7 +1320,7 @@ void __init sync_Arb_IDs(void) APIC_INT_LEVELTRIG | APIC_DM_INIT); } -enum apic_intr_mode_id apic_intr_mode; +enum apic_intr_mode_id apic_intr_mode __ro_after_init; static int __init apic_intr_mode_select(void) { @@ -1495,54 +1508,72 @@ static void lapic_setup_esr(void) oldvalue, value); } -static void apic_pending_intr_clear(void) +#define APIC_IR_REGS APIC_ISR_NR +#define APIC_IR_BITS (APIC_IR_REGS * 32) +#define APIC_IR_MAPSIZE (APIC_IR_BITS / BITS_PER_LONG) + +union apic_ir { + unsigned long map[APIC_IR_MAPSIZE]; + u32 regs[APIC_IR_REGS]; +}; + +static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr) { - long long max_loops = cpu_khz ? cpu_khz : 1000000; - unsigned long long tsc = 0, ntsc; - unsigned int queued; - unsigned long value; - int i, j, acked = 0; + int i, bit; + + /* Read the IRRs */ + for (i = 0; i < APIC_IR_REGS; i++) + irr->regs[i] = apic_read(APIC_IRR + i * 0x10); + + /* Read the ISRs */ + for (i = 0; i < APIC_IR_REGS; i++) + isr->regs[i] = apic_read(APIC_ISR + i * 0x10); - if (boot_cpu_has(X86_FEATURE_TSC)) - tsc = rdtsc(); /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + * If the ISR map is not empty. ACK the APIC and run another round + * to verify whether a pending IRR has been unblocked and turned + * into a ISR. 
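[The new union apic_ir is the core trick of this rework: the eight 32-bit IRR/ISR registers are read into an array that overlays an unsigned long bitmap, so generic helpers like bitmap_empty() and for_each_set_bit() can scan all 256 vectors at once. A standalone demo of the overlay, little-endian x86 assumed, register values invented:

#include <stdio.h>
#include <string.h>

#define IR_REGS 8
#define IR_BITS (IR_REGS * 32)
#define LBITS   (8 * sizeof(unsigned long))

union ir {
	unsigned long map[IR_BITS / (8 * sizeof(unsigned long))];
	unsigned int  regs[IR_REGS];
};

int main(void)
{
	union ir isr;

	memset(&isr, 0, sizeof(isr));
	isr.regs[1] = 0x80000001u;	/* pretend vectors 32 and 63 are in service */
	isr.regs[5] = 0x00000010u;	/* and vector 164 */

	/* Poor man's for_each_set_bit() over the overlaid bitmap */
	for (unsigned int bit = 0; bit < IR_BITS; bit++) {
		if (isr.map[bit / LBITS] & (1UL << (bit % LBITS)))
			printf("would ack vector %u\n", bit);
	}
	return 0;
}
]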
*/ - do { - queued = 0; - for (i = APIC_ISR_NR - 1; i >= 0; i--) - queued |= apic_read(APIC_IRR + i*0x10); - - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for_each_set_bit(j, &value, 32) { - ack_APIC_irq(); - acked++; - } - } - if (acked > 256) { - pr_err("LAPIC pending interrupts after %d EOI\n", acked); - break; - } - if (queued) { - if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { - ntsc = rdtsc(); - max_loops = (long long)cpu_khz << 10; - max_loops -= ntsc - tsc; - } else { - max_loops--; - } - } - } while (queued && max_loops > 0); - WARN_ON(max_loops <= 0); + if (!bitmap_empty(isr->map, APIC_IR_BITS)) { + /* + * There can be multiple ISR bits set when a high priority + * interrupt preempted a lower priority one. Issue an ACK + * per set bit. + */ + for_each_set_bit(bit, isr->map, APIC_IR_BITS) + ack_APIC_irq(); + return true; + } + + return !bitmap_empty(irr->map, APIC_IR_BITS); +} + +/* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now the CPU has serviced that pending interrupt and it + * might not have done the ack_APIC_irq() because it thought, interrupt + * came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear + * the ISR bit and cpu thinks it has already serivced the interrupt. Hence + * a vector might get locked. It was noticed for timer irq (vector + * 0x31). Issue an extra EOI to clear ISR. + * + * If there are pending IRR bits they turn into ISR bits after a higher + * priority ISR bit has been acked. + */ +static void apic_pending_intr_clear(void) +{ + union apic_ir irr, isr; + unsigned int i; + + /* 512 loops are way oversized and give the APIC a chance to obey. */ + for (i = 0; i < 512; i++) { + if (!apic_check_and_ack(&irr, &isr)) + return; + } + /* Dump the IRR/ISR content if that failed */ + pr_warn("APIC: Stale IRR: %256pb ISR: %256pb\n", irr.map, isr.map); } /** @@ -1555,16 +1586,20 @@ static void setup_local_APIC(void) { int cpu = smp_processor_id(); unsigned int value; -#ifdef CONFIG_X86_32 - int logical_apicid, ldr_apicid; -#endif - if (disable_apic) { disable_ioapic_support(); return; } + /* + * If this comes from kexec/kcrash the APIC might be enabled in + * SPIV. Soft disable it before doing further initialization. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); + #ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ if (lapic_is_integrated() && apic->disable_esr) { @@ -1574,8 +1609,6 @@ static void setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_events_lapic_init(); - /* * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. @@ -1590,26 +1623,35 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches the - * actual value. - */ - logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid); - /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + if (apic->dest_logical) { + int logical_apicid, ldr_apicid; + + /* + * APIC LDR is initialized. 
If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches + * the actual value. + */ + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + if (logical_apicid != BAD_APICID) + WARN_ON(logical_apicid != ldr_apicid); + /* Always use the value from LDR. */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + } #endif /* - * Set Task Priority to 'accept all'. We never change this - * later on. + * Set Task Priority to 'accept all except vectors 0-31'. An APIC + * vector in the 16-31 range could be delivered if TPR == 0, but we + * would think it's an exception and terrible things will happen. We + * never change this later on. */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; + value |= 0x10; apic_write(APIC_TASKPRI, value); + /* Clear eventually stale ISR/IRR bits */ apic_pending_intr_clear(); /* @@ -1656,6 +1698,8 @@ static void setup_local_APIC(void) value |= SPURIOUS_APIC_VECTOR; apic_write(APIC_SPIV, value); + perf_events_lapic_init(); + /* * Set up LVT0, LVT1: * diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index bbdca603f94a..7862b152a052 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -8,21 +8,14 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ -#include <linux/acpi.h> -#include <linux/errno.h> -#include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/hardirq.h> #include <linux/export.h> +#include <linux/acpi.h> -#include <asm/smp.h> -#include <asm/ipi.h> -#include <asm/apic.h> -#include <asm/apic_flat_64.h> #include <asm/jailhouse_para.h> +#include <asm/apic.h> + +#include "local.h" static struct apic apic_physflat; static struct apic apic_flat; @@ -83,35 +76,6 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) _flat_send_IPI_mask(mask, vector); } -static void flat_send_IPI_allbutself(int vector) -{ - int cpu = smp_processor_id(); - - if (IS_ENABLED(CONFIG_HOTPLUG_CPU) || vector == NMI_VECTOR) { - if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { - unsigned long mask = cpumask_bits(cpu_online_mask)[0]; - - if (cpu < BITS_PER_LONG) - __clear_bit(cpu, &mask); - - _flat_send_IPI_mask(mask, vector); - } - } else if (num_online_cpus() > 1) { - __default_send_IPI_shortcut(APIC_DEST_ALLBUT, - vector, apic->dest_logical); - } -} - -static void flat_send_IPI_all(int vector) -{ - if (vector == NMI_VECTOR) { - flat_send_IPI_mask(cpu_online_mask, vector); - } else { - __default_send_IPI_shortcut(APIC_DEST_ALLINC, - vector, apic->dest_logical); - } -} - static unsigned int flat_get_apic_id(unsigned long x) { return (x >> 24) & 0xFF; @@ -173,9 +137,9 @@ static struct apic apic_flat __ro_after_init = { .send_IPI = default_send_IPI_single, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, - .send_IPI_allbutself = flat_send_IPI_allbutself, - .send_IPI_all = flat_send_IPI_all, - .send_IPI_self = apic_send_IPI_self, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, .inquire_remote_apic = default_inquire_remote_apic, @@ -225,16 +189,6 @@ static void physflat_init_apic_ldr(void) */ } -static void physflat_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); 
-} - -static void physflat_send_IPI_all(int vector) -{ - default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); -} - static int physflat_probe(void) { if (apic == &apic_physflat || num_possible_cpus() > 8 || @@ -276,9 +230,9 @@ static struct apic apic_physflat __ro_after_init = { .send_IPI = default_send_IPI_single_phys, .send_IPI_mask = default_send_IPI_mask_sequence_phys, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_phys, - .send_IPI_allbutself = physflat_send_IPI_allbutself, - .send_IPI_all = physflat_send_IPI_all, - .send_IPI_self = apic_send_IPI_self, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, .inquire_remote_apic = default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 5078b5ce63a7..98c9bb75d185 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -9,25 +9,9 @@ * to not uglify the caller's code and allow to call (some) apic routines * like self-ipi, etc... */ - -#include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/errno.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/apicdef.h> -#include <asm/apic.h> -#include <asm/setup.h> -#include <linux/smp.h> -#include <asm/ipi.h> - -#include <linux/interrupt.h> -#include <asm/acpi.h> -#include <asm/e820/api.h> +#include <asm/apic.h> static void noop_init_apic_ldr(void) { } static void noop_send_IPI(int cpu, int vector) { } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a5464b8b6c46..cdf45b4700f2 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -10,15 +10,15 @@ * Send feedback to <support@numascale.com> * */ - +#include <linux/types.h> #include <linux/init.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> -#include <asm/ipi.h> -#include <asm/apic_flat_64.h> + #include <asm/pgtable.h> -#include <asm/pci_x86.h> + +#include "local.h" u8 numachip_system __read_mostly; static const struct apic apic_numachip1; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index caedd8d60d36..38b5b51d42f6 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -4,18 +4,13 @@ * * Drives the local APIC in "clustered mode". 
*/ -#include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/kernel.h> -#include <linux/init.h> #include <linux/dmi.h> #include <linux/smp.h> -#include <asm/apicdef.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> #include <asm/apic.h> -#include <asm/ipi.h> + +#include "local.h" static unsigned bigsmp_get_apic_id(unsigned long x) { diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 82f9244fe61f..6ca0f91372fd 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,24 +1,113 @@ // SPDX-License-Identifier: GPL-2.0 + #include <linux/cpumask.h> -#include <linux/interrupt.h> - -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/cache.h> -#include <linux/cpu.h> - -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/apic.h> -#include <asm/proto.h> -#include <asm/ipi.h> - -void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +#include <linux/smp.h> + +#include "local.h" + +DEFINE_STATIC_KEY_FALSE(apic_use_ipi_shorthand); + +#ifdef CONFIG_SMP +static int apic_ipi_shorthand_off __ro_after_init; + +static __init int apic_ipi_shorthand(char *str) +{ + get_option(&str, &apic_ipi_shorthand_off); + return 1; +} +__setup("no_ipi_broadcast=", apic_ipi_shorthand); + +static int __init print_ipi_mode(void) +{ + pr_info("IPI shorthand broadcast: %s\n", + apic_ipi_shorthand_off ? "disabled" : "enabled"); + return 0; +} +late_initcall(print_ipi_mode); + +void apic_smt_update(void) +{ + /* + * Do not switch to broadcast mode if: + * - Disabled on the command line + * - Only a single CPU is online + * - Not all present CPUs have been at least booted once + * + * The latter is important as the local APIC might be in some + * random state and a broadcast might cause havoc. That's + * especially true for NMI broadcasting. + */ + if (apic_ipi_shorthand_off || num_online_cpus() == 1 || + !cpumask_equal(cpu_present_mask, &cpus_booted_once_mask)) { + static_branch_disable(&apic_use_ipi_shorthand); + } else { + static_branch_enable(&apic_use_ipi_shorthand); + } +} + +void apic_send_IPI_allbutself(unsigned int vector) +{ + if (num_online_cpus() < 2) + return; + + if (static_branch_likely(&apic_use_ipi_shorthand)) + apic->send_IPI_allbutself(vector); + else + apic->send_IPI_mask_allbutself(cpu_online_mask, vector); +} + +/* + * Send a 'reschedule' IPI to another CPU. It goes straight through and + * wastes no time serializing anything. Worst case is that we lose a + * reschedule ... 
+ */ +void native_smp_send_reschedule(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); + return; + } + apic->send_IPI(cpu, RESCHEDULE_VECTOR); +} + +void native_send_call_func_single_ipi(int cpu) +{ + apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); +} + +void native_send_call_func_ipi(const struct cpumask *mask) +{ + if (static_branch_likely(&apic_use_ipi_shorthand)) { + unsigned int cpu = smp_processor_id(); + + if (!cpumask_or_equal(mask, cpumask_of(cpu), cpu_online_mask)) + goto sendmask; + + if (cpumask_test_cpu(cpu, mask)) + apic->send_IPI_all(CALL_FUNCTION_VECTOR); + else if (num_online_cpus() > 1) + apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); + return; + } + +sendmask: + apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); +} + +#endif /* CONFIG_SMP */ + +static inline int __prepare_ICR2(unsigned int mask) +{ + return SET_APIC_DEST_FIELD(mask); +} + +static inline void __xapic_wait_icr_idle(void) +{ + while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +void __default_send_IPI_shortcut(unsigned int shortcut, int vector) { /* * Subtle. In the case of the 'never do double writes' workaround @@ -32,12 +121,16 @@ void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int /* * Wait for idle. */ - __xapic_wait_icr_idle(); + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); /* - * No need to touch the target chip field + * No need to touch the target chip field. Also the destination + * mode is ignored when a shorthand is used. */ - cfg = __prepare_ICR(shortcut, vector, dest); + cfg = __prepare_ICR(shortcut, vector, 0); /* * Send the IPI. The write to APIC_ICR fires this off. @@ -133,6 +226,21 @@ void default_send_IPI_single(int cpu, int vector) apic->send_IPI_mask(cpumask_of(cpu), vector); } +void default_send_IPI_allbutself(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_ALLBUT, vector); +} + +void default_send_IPI_all(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_ALLINC, vector); +} + +void default_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector); +} + #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, @@ -192,28 +300,6 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_restore(flags); } -void default_send_IPI_allbutself(int vector) -{ - /* - * if there are no other CPUs in the system then we get an APIC send - * error if we try to broadcast, thus avoid sending IPIs in this case. - */ - if (!(num_online_cpus() > 1)) - return; - - __default_local_send_IPI_allbutself(vector); -} - -void default_send_IPI_all(int vector) -{ - __default_local_send_IPI_all(vector); -} - -void default_send_IPI_self(int vector) -{ - __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical); -} - /* must come after the send_IPI functions above for inlining */ static int convert_apicid_to_cpu(int apic_id) { diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h new file mode 100644 index 000000000000..04797f05ce94 --- /dev/null +++ b/arch/x86/kernel/apic/local.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Historical copyright notices: + * + * Copyright 2004 James Cleverdon, IBM. + * (c) 1995 Alan Cox, Building #3 <alan@redhat.com> + * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com> + * (c) 2002,2003 Andi Kleen, SuSE Labs. 
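[native_send_call_func_ipi() above picks the cheapest ICR operation: a shorthand covers either all CPUs or all-but-self, and is only valid when the target mask plus the sender equals the online mask. A userspace restatement of that selection with plain bitmasks; names are illustrative, not kernel API:

#include <stdio.h>

enum ipi_kind { IPI_MASK, IPI_ALL, IPI_ALLBUTSELF };

static enum ipi_kind pick_ipi(unsigned long target, unsigned long online,
			      int self, int shorthand_ok)
{
	unsigned long self_bit = 1UL << self;

	if (!shorthand_ok)
		return IPI_MASK;
	/* Shorthands hit every online CPU (modulo the sender); if the
	 * request targets anything less, fall back to the mask path. */
	if ((target | self_bit) != online)
		return IPI_MASK;
	return (target & self_bit) ? IPI_ALL : IPI_ALLBUTSELF;
}

int main(void)
{
	/* CPUs 0-3 online, caller on CPU 0 */
	printf("%d\n", pick_ipi(0xeUL, 0xfUL, 0, 1)); /* 2: all-but-self */
	printf("%d\n", pick_ipi(0xfUL, 0xfUL, 0, 1)); /* 1: all */
	printf("%d\n", pick_ipi(0x6UL, 0xfUL, 0, 1)); /* 0: mask */
	return 0;
}
]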
+ */ + +#include <linux/jump_label.h> + +#include <asm/apic.h> + +/* APIC flat 64 */ +void flat_init_apic_ldr(void); + +/* X2APIC */ +int x2apic_apic_id_valid(u32 apicid); +int x2apic_apic_id_registered(void); +void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); +unsigned int x2apic_get_apic_id(unsigned long id); +u32 x2apic_set_apic_id(unsigned int id); +int x2apic_phys_pkg_id(int initial_apicid, int index_msb); +void x2apic_send_IPI_self(int vector); +void __x2apic_send_IPI_shorthand(int vector, u32 which); + +/* IPI */ + +DECLARE_STATIC_KEY_FALSE(apic_use_ipi_shorthand); + +static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector, + unsigned int dest) +{ + unsigned int icr = shortcut | dest; + + switch (vector) { + default: + icr |= APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr |= APIC_DM_NMI; + break; + } + return icr; +} + +void __default_send_IPI_shortcut(unsigned int shortcut, int vector); + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); + +void default_send_IPI_single(int cpu, int vector); +void default_send_IPI_single_phys(int cpu, int vector); +void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, int vector); +void default_send_IPI_allbutself(int vector); +void default_send_IPI_all(int vector); +void default_send_IPI_self(int vector); + +#ifdef CONFIG_X86_32 +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); +void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); +void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); +#endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index ee2d91e382f1..67b33d67002f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -6,51 +6,14 @@ * * Generic x86 APIC driver probe layer. */ -#include <linux/threads.h> -#include <linux/cpumask.h> #include <linux/export.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/init.h> #include <linux/errno.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/apicdef.h> -#include <asm/apic.h> -#include <asm/setup.h> - #include <linux/smp.h> -#include <asm/ipi.h> -#include <linux/interrupt.h> +#include <asm/apic.h> #include <asm/acpi.h> -#include <asm/e820/api.h> -#ifdef CONFIG_HOTPLUG_CPU -#define DEFAULT_SEND_IPI (1) -#else -#define DEFAULT_SEND_IPI (0) -#endif - -int no_broadcast = DEFAULT_SEND_IPI; - -static __init int no_ipi_broadcast(char *str) -{ - get_option(&str, &no_broadcast); - pr_info("Using %s mode\n", - no_broadcast ? "No IPI Broadcast" : "IPI Broadcast"); - return 1; -} -__setup("no_ipi_broadcast=", no_ipi_broadcast); - -static int __init print_ipi_mode(void) -{ - pr_info("Using IPI %s mode\n", - no_broadcast ? "No-Shortcut" : "Shortcut"); - return 0; -} -late_initcall(print_ipi_mode); +#include "local.h" static int default_x86_32_early_logical_apicid(int cpu) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index e6560a02eb46..29f0e0984557 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -8,19 +8,9 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. 
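[__prepare_ICR() in the new local.h assembles the low ICR word from a shorthand, a delivery mode and a vector; NMI swaps in its own delivery mode and the vector field is ignored. A standalone illustration; the constants mirror apicdef.h but are reproduced here only for the demo:

#include <stdio.h>

#define APIC_DEST_SELF   0x40000u
#define APIC_DEST_ALLINC 0x80000u
#define APIC_DEST_ALLBUT 0xC0000u
#define APIC_DM_FIXED    0x00000u
#define APIC_DM_NMI      0x00400u
#define NMI_VECTOR       0x02

static unsigned int prepare_icr(unsigned int shortcut, int vector)
{
	unsigned int icr = shortcut;

	if (vector == NMI_VECTOR)
		icr |= APIC_DM_NMI;		/* NMI ignores the vector field */
	else
		icr |= APIC_DM_FIXED | vector;
	return icr;
}

int main(void)
{
	printf("all-but-self, vector 0xfd: %#x\n",
	       prepare_icr(APIC_DEST_ALLBUT, 0xfd));
	printf("all-but-self, NMI:         %#x\n",
	       prepare_icr(APIC_DEST_ALLBUT, NMI_VECTOR));
	return 0;
}
]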
*/ -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/hardirq.h> -#include <linux/dmar.h> - -#include <asm/smp.h> #include <asm/apic.h> -#include <asm/ipi.h> -#include <asm/setup.h> + +#include "local.h" /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. @@ -46,13 +36,6 @@ void __init default_setup_apic_routing(void) x86_platform.apic_post_init(); } -/* Same for both flat and physical. */ - -void apic_send_IPI_self(int vector) -{ - __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); -} - int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { struct apic **drv; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fdacb864c3dd..2c5676b0a6e7 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -398,6 +398,17 @@ static int activate_reserved(struct irq_data *irqd) if (!irqd_can_reserve(irqd)) apicd->can_reserve = false; } + + /* + * Check to ensure that the effective affinity mask is a subset + * the user supplied affinity mask, and warn the user if it is not + */ + if (!cpumask_subset(irq_data_get_effective_affinity_mask(irqd), + irq_data_get_affinity_mask(irqd))) { + pr_warn("irq %u: Affinity broken due to vector space exhaustion.\n", + irqd->irq); + } + return ret; } diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h deleted file mode 100644 index a49b3604027f..000000000000 --- a/arch/x86/kernel/apic/x2apic.h +++ /dev/null @@ -1,9 +0,0 @@ -/* Common bits for X2APIC cluster/physical modes. */ - -int x2apic_apic_id_valid(u32 apicid); -int x2apic_apic_id_registered(void); -void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); -unsigned int x2apic_get_apic_id(unsigned long id); -u32 x2apic_set_apic_id(unsigned int id); -int x2apic_phys_pkg_id(int initial_apicid, int index_msb); -void x2apic_send_IPI_self(int vector); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 609e499387a1..b0889c48a2ac 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -1,15 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/threads.h> + +#include <linux/cpuhotplug.h> #include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/dmar.h> -#include <linux/irq.h> -#include <linux/cpu.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include <asm/apic.h> -#include <asm/smp.h> -#include "x2apic.h" +#include "local.h" struct cluster_mask { unsigned int clusterid; @@ -84,12 +82,12 @@ x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) static void x2apic_send_IPI_allbutself(int vector) { - __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); } static u32 x2apic_calc_apicid(unsigned int cpu) @@ -158,7 +156,8 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) { struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); - cpumask_clear_cpu(dead_cpu, &cmsk->mask); + if (cmsk) + cpumask_clear_cpu(dead_cpu, &cmsk->mask); free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } diff --git 
a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index b5cf9e7b3830..bc9693841353 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -1,14 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/threads.h> + #include <linux/cpumask.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/dmar.h> +#include <linux/acpi.h> -#include <asm/smp.h> -#include <asm/ipi.h> -#include "x2apic.h" +#include "local.h" int x2apic_phys; @@ -80,12 +75,12 @@ static void static void x2apic_send_IPI_allbutself(int vector) { - __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); + __x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC); } static void init_x2apic_ldr(void) @@ -117,6 +112,14 @@ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) native_x2apic_icr_write(cfg, apicid); } +void __x2apic_send_IPI_shorthand(int vector, u32 which) +{ + unsigned long cfg = __prepare_ICR(which, vector, 0); + + x2apic_wrmsr_fence(); + native_x2apic_icr_write(cfg, 0); +} + unsigned int x2apic_get_apic_id(unsigned long id) { return id; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1e225528f0d7..e6230af19864 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -7,42 +7,22 @@ * * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ +#include <linux/crash_dump.h> +#include <linux/cpuhotplug.h> #include <linux/cpumask.h> -#include <linux/hardirq.h> #include <linux/proc_fs.h> -#include <linux/threads.h> -#include <linux/kernel.h> +#include <linux/memory.h> #include <linux/export.h> -#include <linux/string.h> -#include <linux/ctype.h> -#include <linux/sched.h> -#include <linux/timer.h> -#include <linux/slab.h> -#include <linux/cpu.h> -#include <linux/init.h> -#include <linux/io.h> #include <linux/pci.h> -#include <linux/kdebug.h> -#include <linux/delay.h> -#include <linux/crash_dump.h> -#include <linux/reboot.h> -#include <linux/memory.h> -#include <linux/numa.h> +#include <asm/e820/api.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> -#include <asm/current.h> -#include <asm/pgtable.h> #include <asm/uv/bios.h> #include <asm/uv/uv.h> #include <asm/apic.h> -#include <asm/e820/api.h> -#include <asm/ipi.h> -#include <asm/smp.h> -#include <asm/x86_init.h> -#include <asm/nmi.h> -DEFINE_PER_CPU(int, x2apic_extra_bits); +static DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static bool uv_hubless_system; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d3d075226c0a..24d2fde30d00 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -6,13 +6,28 @@ #include <asm/ia32.h> #define __SYSCALL_64(nr, sym, qual) [nr] = 1, +#define __SYSCALL_X32(nr, sym, qual) static char syscalls_64[] = { #include <asm/syscalls_64.h> }; +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#ifdef CONFIG_X86_X32_ABI +#define __SYSCALL_64(nr, sym, qual) +#define __SYSCALL_X32(nr, sym, qual) [nr] = 1, +static char syscalls_x32[] = { +#include <asm/syscalls_64.h> +}; +#undef __SYSCALL_64 +#undef __SYSCALL_X32 +#endif + #define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls_ia32[] = { #include <asm/syscalls_32.h> }; +#undef 
__SYSCALL_I386 #if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) #include <asm/kvm_para.h> @@ -62,7 +77,6 @@ int main(void) ENTRY(cr2); ENTRY(cr3); ENTRY(cr4); - ENTRY(cr8); ENTRY(gdt_desc); BLANK(); #undef ENTRY @@ -80,6 +94,11 @@ int main(void) DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); +#ifdef CONFIG_X86_X32_ABI + DEFINE(__NR_syscall_x32_max, sizeof(syscalls_x32) - 1); + DEFINE(X32_NR_syscalls, sizeof(syscalls_x32)); +#endif + DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1); DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7a1e5a9331c..890f60083eca 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o +obj-y += intel.o intel_pconfig.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 68c363c341bf..90f75e515876 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/random.h> +#include <linux/topology.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> @@ -889,6 +890,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_ZEN); +#ifdef CONFIG_NUMA + node_reclaim_distance = 32; +#endif + /* * Fix erratum 1076: CPB feature bit not being set in CPUID. * Always set it, except when running under a hypervisor. @@ -945,12 +950,8 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); if (cpu_has(c, X86_FEATURE_XMM2)) { - unsigned long long val; - int ret; - /* - * A serializing LFENCE has less overhead than MFENCE, so - * use it for execution serialization. On families which + * Use LFENCE for execution serialization. On families which * don't have that MSR, LFENCE is already serializing. * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. @@ -958,19 +959,8 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* - * Verify that the MSR write was successful (could be running - * under a hypervisor) and only then assume that LFENCE is - * serializing. - */ - ret = rdmsrl_safe(MSR_F10H_DECFG, &val); - if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { - /* A serializing LFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - } else { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - } + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } /* diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c6fa3ef10b4e..4c7b0fa15a19 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,7 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. 
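[The asm-offsets_64.c hunk above sizes the new X32 table with the same sizeof trick already used for the 64-bit and ia32 tables: each __SYSCALL_* macro expands to the designated initializer [nr] = 1, so sizeof(array) is the highest syscall number plus one, with gaps staying zero. A compilable demo:

#include <stdio.h>

#define SYSCALL(nr, sym) [nr] = 1,

static char syscalls[] = {
	SYSCALL(0, read)
	SYSCALL(1, write)
	SYSCALL(42, connect)	/* gaps are fine; they stay zero */
};

int main(void)
{
	printf("__NR_syscall_max = %zu\n", sizeof(syscalls) - 1); /* 42 */
	printf("NR_syscalls      = %zu\n", sizeof(syscalls));     /* 43 */
	return 0;
}
]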
*/ u64 x86_spec_ctrl_base; @@ -105,6 +106,7 @@ void __init check_bugs(void) ssb_select_mitigation(); l1tf_select_mitigation(); mds_select_mitigation(); + taa_select_mitigation(); arch_smt_update(); @@ -269,6 +271,100 @@ static int __init mds_cmdline(char *str) early_param("mds", mds_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + goto out; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ + if (taa_mitigation == TAA_MITIGATION_OFF) + goto out; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + +out: + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -786,13 +882,10 @@ static void update_mds_branch_idle(void) } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" -void arch_smt_update(void) +void cpu_bugs_smt_update(void) { - /* Enhanced IBRS implies STIBP. No update required. 
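[taa_select_mitigation() above is a short decision tree: no RTM means tsx=off already handled it, MD_CLEAR selects VERW, and on MDS_NO parts the TSX_CTRL MSR must additionally be present or the microcode is deemed too old for VERW to help. The same tree as a pure function, cmdline/mitigations=off handling omitted and flags standing in for the cpufeature/MSR queries:

#include <stdio.h>

enum taa { TAA_OFF, TAA_UCODE_NEEDED, TAA_VERW, TAA_TSX_DISABLED };

static enum taa pick_taa(int bug_taa, int has_rtm, int has_md_clear,
			 int mds_no, int has_tsx_ctrl)
{
	if (!bug_taa)
		return TAA_OFF;
	if (!has_rtm)			/* tsx=off already disabled TSX */
		return TAA_TSX_DISABLED;
	if (!has_md_clear)
		return TAA_UCODE_NEEDED;
	/* On MDS_NO parts, VERW only clears CPU buffers once the
	 * TSX_CTRL-capable microcode update is in place. */
	if (mds_no && !has_tsx_ctrl)
		return TAA_UCODE_NEEDED;
	return TAA_VERW;
}

int main(void)
{
	printf("%d\n", pick_taa(1, 1, 1, 1, 0)); /* 1: ucode needed */
	printf("%d\n", pick_taa(1, 1, 1, 1, 1)); /* 2: VERW */
	printf("%d\n", pick_taa(1, 0, 0, 0, 0)); /* 3: TSX disabled */
	return 0;
}
]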
*/ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return; - mutex_lock(&spec_ctrl_mutex); switch (spectre_v2_user) { @@ -819,6 +912,17 @@ void arch_smt_update(void) break; } + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1149,6 +1253,9 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -1184,15 +1291,15 @@ static void override_cache_bits(struct cpuinfo_x86 *c) case INTEL_FAM6_WESTMERE: case INTEL_FAM6_SANDYBRIDGE: case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL_CORE: - case INTEL_FAM6_HASWELL_ULT: - case INTEL_FAM6_HASWELL_GT3E: - case INTEL_FAM6_BROADWELL_CORE: - case INTEL_FAM6_BROADWELL_GT3E: - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: + case INTEL_FAM6_HASWELL: + case INTEL_FAM6_HASWELL_L: + case INTEL_FAM6_HASWELL_G: + case INTEL_FAM6_BROADWELL: + case INTEL_FAM6_BROADWELL_G: + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: if (c->x86_cache_bits < 44) c->x86_cache_bits = 44; break; @@ -1304,11 +1411,24 @@ static ssize_t l1tf_show_state(char *buf) l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} #else static ssize_t l1tf_show_state(char *buf) { return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} #endif static ssize_t mds_show_state(char *buf) @@ -1328,6 +1448,21 @@ static ssize_t mds_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? 
"vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1398,6 +1533,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MDS: return mds_show_state(buf); + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1434,4 +1575,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f125bf7ecb6f..fffe21945374 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1016,13 +1016,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -#define NO_SPECULATION BIT(0) -#define NO_MELTDOWN BIT(1) -#define NO_SSB BIT(2) -#define NO_L1TF BIT(3) -#define NO_MDS BIT(4) -#define MSBDS_ONLY BIT(5) -#define NO_SWAPGS BIT(6) +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1043,26 +1044,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(CORE_YONAH, NO_SSB), - 
VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1072,15 +1074,17 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), {} }; @@ -1091,19 +1095,30 @@ static bool __init cpu_matches(unsigned long which) return m && !!(m->driver_data & which); } -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +u64 x86_read_arch_cap_msr(void) { u64 ia32_cap = 0; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1120,6 +1135,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (!cpu_matches(NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. 
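[The VULNWL() table above is consumed via cpu_matches(): find the (vendor, family, model) entry for the running CPU and test a flag in that entry's driver_data. A simplified stand-in for struct x86_cpu_id matching; the vendor codes and the GOLDMONT model number here are illustrative only:

#include <stdio.h>

#define NO_MDS  (1u << 4)
#define NO_ITLB (1u << 7)

struct vuln_entry { int vendor, family, model; unsigned int flags; };

static const struct vuln_entry whitelist[] = {
	{ 0 /* "Intel" */, 6,    0x5c /* GOLDMONT */, NO_MDS | NO_ITLB },
	{ 2 /* "AMD"   */, 0x17, -1   /* any model */, NO_MDS },
	{ 0, 0, 0, 0 }			/* terminator */
};

static int cpu_matches(int vendor, int family, int model, unsigned int which)
{
	for (const struct vuln_entry *e = whitelist; e->flags; e++) {
		if (e->vendor == vendor && e->family == family &&
		    (e->model == -1 || e->model == model))
			return !!(e->flags & which);
	}
	return 0;			/* not whitelisted: assume affected */
}

int main(void)
{
	printf("%d\n", cpu_matches(0, 6, 0x5c, NO_ITLB)); /* 1 */
	printf("%d\n", cpu_matches(0, 6, 0x55, NO_ITLB)); /* 0: not listed */
	return 0;
}
]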
+ * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + if (cpu_matches(NO_MELTDOWN)) return; @@ -1553,6 +1583,8 @@ void __init identify_boot_cpu(void) #endif cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); + + tsx_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1957,3 +1989,14 @@ void microcode_check(void) pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); } + +/* + * Invoked from core CPU hotplug code after hotplug operations + */ +void arch_smt_update(void) +{ + /* Handle the speculative execution misfeatures */ + cpu_bugs_smt_update(); + /* Check whether IPI broadcasting can be enabled */ + apic_smt_update(); +} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c0e2407abdd6..38ab6e115eac 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -44,6 +44,22 @@ struct _tlb_table { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); @@ -62,4 +78,6 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern u64 x86_read_arch_cap_msr(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b5353244749b..3cbe24ca80ab 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -20,54 +20,55 @@ struct cpuid_dep { * but it's difficult to tell that to the init reference checker. 
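[The X86_BUG_TAA condition added to cpu_set_bug_bits() reduces to a small predicate over two MSR_IA32_ARCH_CAPABILITIES bits and the RTM CPUID flag; the TSX_CTRL term is what catches the kexec case where TSX was hidden before this kernel booted. Restated standalone, bit positions mirroring msr-index.h:

#include <stdio.h>

#define ARCH_CAP_TSX_CTRL_MSR (1ULL << 7)
#define ARCH_CAP_TAA_NO       (1ULL << 8)

static int cpu_has_bug_taa(unsigned long long ia32_cap, int has_rtm)
{
	if (ia32_cap & ARCH_CAP_TAA_NO)
		return 0;
	/* RTM visible, or TSX merely hidden by TSX_CTRL (kexec case) */
	return has_rtm || !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
}

int main(void)
{
	printf("%d\n", cpu_has_bug_taa(0ULL, 1));	/* 1: RTM on, no TAA_NO */
	printf("%d\n", cpu_has_bug_taa(ARCH_CAP_TAA_NO, 1)); /* 0: fixed part */
	printf("%d\n", cpu_has_bug_taa(ARCH_CAP_TSX_CTRL_MSR, 0)); /* 1 */
	return 0;
}
]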
*/ static const struct cpuid_dep cpuid_deps[] = { - { X86_FEATURE_FXSR, X86_FEATURE_FPU }, - { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, - { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, - { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, - { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, - { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, - { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, - { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, - { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, - { X86_FEATURE_MMX, X86_FEATURE_FXSR }, - { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, - { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, - { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, - { X86_FEATURE_XMM, X86_FEATURE_FXSR }, - { X86_FEATURE_XMM2, X86_FEATURE_XMM }, - { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, - { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, - { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, - { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, - { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, - { X86_FEATURE_AES, X86_FEATURE_XMM2 }, - { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, - { X86_FEATURE_FMA, X86_FEATURE_AVX }, - { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, - { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, - { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, - { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, - { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, - { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, - { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, + { X86_FEATURE_FXSR, X86_FEATURE_FPU }, + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, + { X86_FEATURE_MMX, X86_FEATURE_FXSR }, + { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, 
X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, {} }; diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 415621ddb8a2..4e28c1fc8749 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -330,12 +330,8 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); if (cpu_has(c, X86_FEATURE_XMM2)) { - unsigned long long val; - int ret; - /* - * A serializing LFENCE has less overhead than MFENCE, so - * use it for execution serialization. On families which + * Use LFENCE for execution serialization. On families which * don't have that MSR, LFENCE is already serializing. * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. @@ -343,19 +339,8 @@ static void init_hygon(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* - * Verify that the MSR write was successful (could be running - * under a hypervisor) and only then assume that LFENCE is - * serializing. 
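[The realigned cpuid_deps[] table above is live data, not documentation: when one feature is cleared, everything that transitively depends on it must go too, which the kernel achieves by re-scanning the table until nothing changes (loosely what do_clear_cpu_cap() does). A toy fixed-point version with invented feature IDs:

#include <stdio.h>

struct dep { int feature, depends; };

static const struct dep deps[] = {
	{ 2 /* XSAVE */, 1 /* FXSR  */ },
	{ 3 /* AVX   */, 2 /* XSAVE */ },
	{ 4 /* AVX2  */, 3 /* AVX   */ },
	{ 0, 0 }
};

static void clear_feature(unsigned int *caps, int feature)
{
	int changed = 1;

	*caps &= ~(1u << feature);

	/* Re-scan until no feature with a missing dependency survives */
	while (changed) {
		changed = 0;
		for (const struct dep *d = deps; d->feature; d++) {
			if ((*caps & (1u << d->feature)) &&
			    !(*caps & (1u << d->depends))) {
				*caps &= ~(1u << d->feature);
				changed = 1;
			}
		}
	}
}

int main(void)
{
	unsigned int caps = 0x1e;	/* features 1-4 present */

	clear_feature(&caps, 2);	/* drop "XSAVE" */
	printf("caps = %#x\n", caps);	/* 0x2: only "FXSR" is left */
	return 0;
}
]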
- */ - ret = rdmsrl_safe(MSR_F10H_DECFG, &val); - if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { - /* A serializing LFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - } else { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - } + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); } /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 24e619d1bf79..4a900804a023 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -142,21 +142,21 @@ struct sku_microcode { u32 microcode; }; static const struct sku_microcode spectre_bad_microcodes[] = { - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x80 }, - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x80 }, - { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE, 0x09, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 }, + { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 }, { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, - { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, - { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, - { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, - { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 }, + { INTEL_FAM6_BROADWELL, 0x04, 0x28 }, + { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b }, + { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 }, + { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 }, { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, - { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 }, - { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 }, - { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 }, + { INTEL_FAM6_HASWELL_L, 0x01, 0x21 }, + { INTEL_FAM6_HASWELL_G, 0x01, 0x18 }, + { INTEL_FAM6_HASWELL, 0x03, 0x23 }, { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, @@ -265,9 +265,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ if (c->x86 == 6) { switch (c->x86_model) { - case 0x27: /* Penwell */ - case 0x35: /* Cloverview */ - case 0x4a: /* Merrifield */ + case INTEL_FAM6_ATOM_SALTWELL_MID: + case INTEL_FAM6_ATOM_SALTWELL_TABLET: + case INTEL_FAM6_ATOM_SILVERMONT_MID: + case INTEL_FAM6_ATOM_AIRMONT_NP: set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: @@ -761,6 +762,11 @@ static void init_intel(struct cpuinfo_x86 *c) detect_tme(c); init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index e43eb6732630..88cd9598fa57 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -479,7 +479,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) switch (c->x86_model) { case INTEL_FAM6_IVYBRIDGE_X: case INTEL_FAM6_HASWELL_X: - case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_XEON_PHI_KNL: diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 210f1f5db5f7..87bcdc6dc2f0 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -107,11 +107,11 @@ static struct severity { */ MCESEV( AO, "Action optional: memory scrubbing error", - SER, 
MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) + SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB) ), MCESEV( AO, "Action optional: last level cache writeback error", - SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) + SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB) ), /* ignore OVER for UCNA */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 062f77279ce3..c656d92cd708 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -29,6 +29,7 @@ #include <asm/timer.h> #include <asm/reboot.h> #include <asm/nmi.h> +#include <clocksource/hyperv_timer.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); @@ -215,6 +216,10 @@ static void __init ms_hyperv_init_platform(void) int hv_host_info_ecx; int hv_host_info_edx; +#ifdef CONFIG_PARAVIRT + pv_info.name = "Hyper-V"; +#endif + /* * Extract the features and hints */ @@ -338,6 +343,15 @@ static void __init ms_hyperv_init_platform(void) x2apic_phys = 1; # endif + /* Register Hyper-V specific clocksource */ + hv_init_clocksource(); +#endif +} + +void hv_setup_sched_clock(void *sched_clock) +{ +#ifdef CONFIG_PARAVIRT + pv_ops.time.sched_clock = sched_clock; #endif } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index efbd54cc4e69..055c8613b531 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } md.priv = of->kn->priv; resid = md.u.rid; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a46dee8e78db..2e3b06d6bbc6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); - rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; - rdt_last_cmd_puts("Directory was removed\n"); goto unlock; } @@ -2648,10 +2646,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, int ret; prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); - rdt_last_cmd_clear(); if (!prdtgrp) { ret = -ENODEV; - rdt_last_cmd_puts("Directory was removed\n"); goto out_unlock; } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000000..3e20d322bc98 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> + */ + +#include <linux/cpufeature.h> + +#include <asm/cmdline.h> + +#include "cpu.h" + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. 
+ */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static bool __init tsx_ctrl_is_supported(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + if (!tsx_ctrl_is_supported()) + return; + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("tsx: invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the + * RTM CPUID bit. Clear it here since it is now + * expected to be not set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the + * RTM CPUID bit. Force it here since it is now + * expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + } +} diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 32b4dc9030aa..c222f283b456 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -17,6 +17,12 @@ */ static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); +u32 get_umwait_control_msr(void) +{ + return umwait_control_cached; +} +EXPORT_SYMBOL_GPL(get_umwait_control_msr); + /* * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by * hardware or BIOS before kernel boot. 
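The effect of TSX_CTRL_CPUID_CLEAR above is directly visible to applications: once the bit is written, CPUID.(EAX=7,ECX=0):EBX stops advertising HLE (bit 4) and RTM (bit 11). A minimal user-space probe (an illustrative sketch, not part of the patch) that tests the same enumeration an application would check before attempting lock elision:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=7,ECX=0):EBX enumerates HLE (bit 4) and RTM (bit 11) */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("HLE: %s\n", (ebx & (1u << 4))  ? "yes" : "no");
	printf("RTM: %s\n", (ebx & (1u << 11)) ? "yes" : "no");
	return 0;
}

With tsx=off on a TSX_CTRL-capable part, both bits read as clear even though the silicon still implements TSX.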
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 3c648476d4fb..46d732696c1c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -30,34 +30,69 @@ #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> +#include <asm/vmware.h> #undef pr_fmt #define pr_fmt(fmt) "vmware: " fmt -#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define CPUID_VMWARE_FEATURES_LEAF 0x40000010 +#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0) +#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1) + #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 -#define VMWARE_HYPERVISOR_PORT 0x5658 -#define VMWARE_PORT_CMD_GETVERSION 10 -#define VMWARE_PORT_CMD_GETHZ 45 -#define VMWARE_PORT_CMD_GETVCPU_INFO 68 -#define VMWARE_PORT_CMD_LEGACY_X2APIC 3 -#define VMWARE_PORT_CMD_VCPU_RESERVED 31 +#define VMWARE_CMD_GETVERSION 10 +#define VMWARE_CMD_GETHZ 45 +#define VMWARE_CMD_GETVCPU_INFO 68 +#define VMWARE_CMD_LEGACY_X2APIC 3 +#define VMWARE_CMD_VCPU_RESERVED 31 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ - __asm__("inl (%%dx)" : \ - "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ - "0"(VMWARE_HYPERVISOR_MAGIC), \ - "1"(VMWARE_PORT_CMD_##cmd), \ - "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ - "memory"); + __asm__("inl (%%dx), %%eax" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \ + __asm__("vmcall" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(0), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \ + __asm__("vmmcall" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "a"(VMWARE_HYPERVISOR_MAGIC), \ + "c"(VMWARE_CMD_##cmd), \ + "d"(0), "b"(UINT_MAX) : \ + "memory") + +#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \ + switch (vmware_hypercall_mode) { \ + case CPUID_VMWARE_FEATURES_ECX_VMCALL: \ + VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \ + break; \ + case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \ + VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \ + break; \ + default: \ + VMWARE_PORT(cmd, eax, ebx, ecx, edx); \ + break; \ + } \ + } while (0) static unsigned long vmware_tsc_khz __ro_after_init; +static u8 vmware_hypercall_mode __ro_after_init; static inline int __vmware_platform(void) { uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); + VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx); return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } @@ -129,6 +164,10 @@ static void __init vmware_set_capabilities(void) { setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL) + setup_force_cpu_cap(X86_FEATURE_VMCALL); + else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL) + setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL); } static void __init vmware_platform_setup(void) @@ -136,7 +175,7 @@ static void __init vmware_platform_setup(void) uint32_t eax, ebx, ecx, edx; uint64_t lpj, tsc_khz; - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + VMWARE_CMD(GETHZ, eax, ebx, ecx, edx); if (ebx != UINT_MAX) { lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); @@ -174,10 +213,21 @@ static void __init vmware_platform_setup(void) vmware_set_capabilities(); } +static u8 vmware_select_hypercall(void) +{ + int eax, ebx, ecx, edx; + + 
cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx); + return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL | + CPUID_VMWARE_FEATURES_ECX_VMCALL)); +} + /* * While checking the dmi string information, just checking the product * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. + * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode + * intentionally defaults to 0. */ static uint32_t __init vmware_platform(void) { @@ -187,8 +237,16 @@ static uint32_t __init vmware_platform(void) cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], &hyper_vendor_id[1], &hyper_vendor_id[2]); - if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) + if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) { + if (eax >= CPUID_VMWARE_FEATURES_LEAF) + vmware_hypercall_mode = + vmware_select_hypercall(); + + pr_info("hypercall mode: 0x%02x\n", + (unsigned int) vmware_hypercall_mode); + return CPUID_VMWARE_INFO_LEAF; + } } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) return 1; @@ -200,9 +258,9 @@ static uint32_t __init vmware_platform(void) static bool __init vmware_legacy_x2apic_available(void) { uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx); - return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 && - (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; + VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); + return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 && + (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0; } const __initconst struct hypervisor_x86 x86_hyper_vmware = { diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2bf70a2fed90..eb651fbde92a 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -225,8 +225,6 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) if (crashk_low_res.end) { ret = crash_exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); - if (ret) - return ret; } return ret; diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 22369dd5de3b..045e82e8945b 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -70,3 +70,8 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, { return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); } + +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0, sev_active()); +} diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2b5886401e5f..e07424e19274 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -367,13 +367,18 @@ NOKPROBE_SYMBOL(oops_end); int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + /* Save the regs of the first oops for the executive summary later. */ if (!die_counter) exec_summary_regs = *regs; + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk(KERN_DEFAULT "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? 
" KASAN" : "", diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 753b8cfe8b8a..87b97897a881 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -94,6 +94,13 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + /* + * Handle the case where stack trace is collected _before_ + * cea_exception_stacks had been initialized. + */ + if (!begin) + return false; + end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6c4f01540833..4cba91ec8049 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -549,6 +549,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_CNL_IDS(&gen9_early_ops), INTEL_ICL_11_IDS(&gen11_early_ops), INTEL_EHL_IDS(&gen11_early_ops), + INTEL_TGL_12_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); @@ -709,6 +710,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x3ec4, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 29ffa495bd1c..206a4b6144c2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -222,13 +222,31 @@ unsigned long __head __startup_64(unsigned long physaddr, * we might write invalid pmds, when the kernel is relocated * cleanup_highmap() fixes this up along with the mappings * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. 
*/ pmd = fixup_pointer(level2_kernel_pgt, physaddr); - for (i = 0; i < PTRS_PER_PMD; i++) { + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index((unsigned long)_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index((unsigned long)_end); i++) if (pmd[i] & _PAGE_PRESENT) pmd[i] += load_delta; - } + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; /* * Fixup phys_base - remove the memory encryption mask to obtain diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 4c407833faca..4d4f5d9faac3 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -74,9 +74,9 @@ bool arch_ima_get_secureboot(void) /* secureboot arch rules */ static const char * const sb_arch_rules[] = { -#if !IS_ENABLED(CONFIG_KEXEC_VERIFY_SIG) +#if !IS_ENABLED(CONFIG_KEXEC_SIG) "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig", -#endif /* CONFIG_KEXEC_VERIFY_SIG */ +#endif /* CONFIG_KEXEC_SIG */ "measure func=KEXEC_KERNEL_CHECK", #if !IS_ENABLED(CONFIG_MODULE_SIG) "appraise func=MODULE_CHECK appraise_type=imasig", diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 0fe1c8782208..61a89d3c0382 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -11,6 +11,7 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/ioport.h> +#include <linux/security.h> #include <linux/smp.h> #include <linux/stddef.h> #include <linux/slab.h> @@ -31,7 +32,8 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) + if (turn_on && (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT))) return -EPERM; /* @@ -126,7 +128,8 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) return -EINVAL; /* Trying to gain more privileges? 
*/ if (level > old) { - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || + security_locked_down(LOCKDOWN_IOPORT)) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4215653f8a8e..21efee32e2b1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -243,11 +243,15 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); desc = __this_cpu_read(vector_irq[vector]); - - if (!handle_irq(desc, regs)) { + if (likely(!IS_ERR_OR_NULL(desc))) { + if (IS_ENABLED(CONFIG_X86_32)) + handle_irq(desc, regs); + else + generic_handle_irq_desc(desc); + } else { ack_APIC_irq(); - if (desc != VECTOR_RETRIGGERED && desc != VECTOR_SHUTDOWN) { + if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", __func__, smp_processor_id(), vector); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index fc34816c6f04..a759ca97cd01 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -148,18 +148,13 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) +void handle_irq(struct irq_desc *desc, struct pt_regs *regs) { int overflow = check_stack_overflow(); - if (IS_ERR_OR_NULL(desc)) - return false; - if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { if (unlikely(overflow)) print_stack_overflow(); generic_handle_irq_desc(desc); } - - return true; } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6bf6517a05bb..12df3a4abfdd 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,15 +26,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; DECLARE_INIT_PER_CPU(irq_stack_backing_store); -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) -{ - if (IS_ERR_OR_NULL(desc)) - return false; - - generic_handle_irq_desc(desc); - return true; -} - #ifdef CONFIG_VMAP_STACK /* * VMAP the backing store with guard pages diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 5ebcd02cbca7..d2f4e706a428 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -180,6 +180,7 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, if (efi_enabled(EFI_OLD_MEMMAP)) return 0; + params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; ei->efi_systab_hi = current_ei->efi_systab_hi; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 23297ea64f5f..c44fe7d8d9a4 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -416,7 +416,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) */ void kgdb_roundup_cpus(void) { - apic->send_IPI_allbutself(APIC_DM_NMI); + apic_send_IPI_allbutself(NMI_VECTOR); } #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0e0b08008b5a..43fc13c831af 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -580,7 +580,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, if (setup_detour_execution(p, regs, reenter)) return; -#if !defined(CONFIG_PREEMPT) +#if !defined(CONFIG_PREEMPTION) if (p->ainsn.boostable && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ if (!reenter) diff --git 
a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 9d4aedece363..b348dd506d58 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -403,7 +403,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, (u8 *)op->kp.addr + op->optinsn.size); len += RELATIVEJUMP_SIZE; - /* We have to use text_poke for instuction buffer because it is RO */ + /* We have to use text_poke() for instruction buffer because it is RO */ text_poke(slot, buf, len); ret = 0; out: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4ab377c9fffe..e820568ed4d5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -311,7 +311,7 @@ static void kvm_guest_cpu_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif pa |= KVM_ASYNC_PF_ENABLED; @@ -502,16 +502,6 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) __send_ipi_mask(local_mask, vector); } -static void kvm_send_ipi_allbutself(int vector) -{ - kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); -} - -static void kvm_send_ipi_all(int vector) -{ - __send_ipi_mask(cpu_online_mask, vector); -} - /* * Set the IPI entry points */ @@ -519,8 +509,6 @@ static void kvm_setup_pv_ipi(void) { apic->send_IPI_mask = kvm_send_ipi_mask; apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; - apic->send_IPI_allbutself = kvm_send_ipi_allbutself; - apic->send_IPI_all = kvm_send_ipi_all; pr_info("KVM setup pv IPIs\n"); } @@ -705,6 +693,7 @@ unsigned int kvm_arch_para_hints(void) { return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES); } +EXPORT_SYMBOL_GPL(kvm_arch_para_hints); static uint32_t __init kvm_detect(void) { @@ -867,3 +856,39 @@ void __init kvm_spinlock_init(void) } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + +static void kvm_disable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +} + +static void kvm_enable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 1); +} + +void arch_haltpoll_enable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { + pr_err_once("kvm: host does not support poll control\n"); + pr_err_once("kvm: host upgrade recommended\n"); + return; + } + + /* Enable guest halt poll disables host halt poll */ + smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_enable); + +void arch_haltpoll_disable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + return; + + /* Enable guest halt poll disables host halt poll */ + smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_disable); +#endif diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 3db2252b958d..1547be359d7f 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -34,6 +34,7 @@ #include <linux/notifier.h> #include <linux/uaccess.h> #include <linux/gfp.h> +#include <linux/security.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -79,6 +80,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf, int err = 0; ssize_t bytes = 0; + err = security_locked_down(LOCKDOWN_MSR); + if (err) + return err; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -130,6 +135,9 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = -EFAULT; break; } + err = 
security_locked_down(LOCKDOWN_MSR); + if (err) + break; err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4df7705022b9..e676a9916c49 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -512,6 +512,9 @@ NOKPROBE_SYMBOL(is_debug_stack); dotraplinkage notrace void do_nmi(struct pt_regs *regs, long error_code) { + if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) + return; + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { this_cpu_write(nmi_state, NMI_LATCHED); return; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 0aa6256eedd8..59d3d2763a9e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -311,10 +311,6 @@ struct paravirt_patch_template pv_ops = { .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, -#ifdef CONFIG_X86_64 - .cpu.read_cr8 = native_read_cr8, - .cpu.write_cr8 = native_write_cr8, -#endif .cpu.wbinvd = native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 9d4343aa481b..23fdec030c37 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -468,6 +468,8 @@ static const struct dma_map_ops calgary_dma_ops = { .map_page = calgary_map_page, .unmap_page = calgary_unmap_page, .dma_supported = dma_direct_supported, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, }; static inline void __iomem * busno_to_bbar(unsigned char num) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f62b498b18fb..fa4352dce491 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/dma-direct.h> #include <linux/dma-debug.h> +#include <linux/iommu.h> #include <linux/dmar.h> #include <linux/export.h> #include <linux/memblock.h> @@ -34,21 +35,6 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -/* - * This variable becomes 1 if iommu=pt is passed on the kernel command line. - * If this variable is 1, IOMMU implementations do no DMA translation for - * devices and allow every device to access to whole physical memory. This is - * useful if a user wants to use an IOMMU only for KVM device assignment to - * guests and not for driver dma translation. - * It is also possible to disable by default in kernel config, and enable with - * iommu=nopt at boot time. 
- */ -#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH -int iommu_pass_through __read_mostly = 1; -#else -int iommu_pass_through __read_mostly; -#endif - extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; void __init pci_iommu_alloc(void) @@ -120,9 +106,9 @@ static __init int iommu_setup(char *p) swiotlb = 1; #endif if (!strncmp(p, "pt", 2)) - iommu_pass_through = 1; + iommu_set_default_passthrough(true); if (!strncmp(p, "nopt", 4)) - iommu_pass_through = 0; + iommu_set_default_translated(true); gart_parse_options(p); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 5f5302028a9a..c2cfa5e7c152 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -/* Glue code to lib/swiotlb.c */ #include <linux/pci.h> #include <linux/cache.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 75fea0d48c0e..5e94c4354d4e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -580,7 +580,7 @@ void __cpuidle default_idle(void) safe_halt(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } -#ifdef CONFIG_APM_MODULE +#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); #endif diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 320ab978fb1f..1d0797b2338a 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // // Code shared between 32 and 64 bit diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 8451f38ad399..1daf8f2aa21f 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -90,8 +90,6 @@ static void ich_force_hpet_resume(void) BUG(); else printk(KERN_DEBUG "Force enabled HPET at resume\n"); - - return; } static void ich_force_enable_hpet(struct pci_dev *dev) @@ -448,7 +446,6 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev) dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; - return; } /* ISA Bridges */ @@ -513,7 +510,6 @@ static void e6xx_force_enable_hpet(struct pci_dev *dev) force_hpet_resume_type = NONE_FORCE_HPET_RESUME; dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " "0x%lx\n", force_hpet_address); - return; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU, e6xx_force_enable_hpet); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 09d6bded3c1e..0cc7c0b106bb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -828,11 +828,6 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) return NMI_HANDLED; } -static void smp_send_nmi_allbutself(void) -{ - apic->send_IPI_allbutself(NMI_VECTOR); -} - /* * Halt all other CPUs, calling the specified function on each of them * @@ -861,7 +856,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) */ wmb(); - smp_send_nmi_allbutself(); + apic_send_IPI_allbutself(NMI_VECTOR); /* Kick CPUs looping in NMI context. 
*/ WRITE_ONCE(crash_ipi_issued, 1); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbe35bf879f5..77ea96b794bd 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -486,7 +486,7 @@ static int __init reserve_crashkernel_low(void) ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret) { /* - * two parts from lib/swiotlb.c: + * two parts from kernel/dma/swiotlb.c: * -swiotlb size: user-specified with swiotlb= or default. * * -swiotlb overflow buffer: now hardcoded to 32k. We round it diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 96421f97e75c..b8d4e9c3c070 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -115,46 +115,6 @@ static atomic_t stopping_cpu = ATOMIC_INIT(-1); static bool smp_no_nmi_ipi = false; -/* - * this function sends a 'reschedule' IPI to another CPU. - * it goes straight through and wastes no time serializing - * anything. Worst case is that we lose a reschedule ... - */ -static void native_smp_send_reschedule(int cpu) -{ - if (unlikely(cpu_is_offline(cpu))) { - WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", cpu); - return; - } - apic->send_IPI(cpu, RESCHEDULE_VECTOR); -} - -void native_send_call_func_single_ipi(int cpu) -{ - apic->send_IPI(cpu, CALL_FUNCTION_SINGLE_VECTOR); -} - -void native_send_call_func_ipi(const struct cpumask *mask) -{ - cpumask_var_t allbutself; - - if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - return; - } - - cpumask_copy(allbutself, cpu_online_mask); - __cpumask_clear_cpu(smp_processor_id(), allbutself); - - if (cpumask_equal(mask, allbutself) && - cpumask_equal(cpu_online_mask, cpu_callout_mask)) - apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); - else - apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - - free_cpumask_var(allbutself); -} - static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) { /* We are registered on stopping cpu too, avoid spurious NMI */ @@ -179,6 +139,12 @@ asmlinkage __visible void smp_reboot_interrupt(void) irq_exit(); } +static int register_stop_handler(void) +{ + return register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop"); +} + static void native_stop_other_cpus(int wait) { unsigned long flags; @@ -209,42 +175,44 @@ static void native_stop_other_cpus(int wait) /* sync above data before sending IRQ */ wmb(); - apic->send_IPI_allbutself(REBOOT_VECTOR); + apic_send_IPI_allbutself(REBOOT_VECTOR); /* - * Don't wait longer than a second if the caller - * didn't ask us to wait. + * Don't wait longer than a second for IPI completion. The + * wait request is not checked here because that would + * prevent an NMI shutdown attempt in case that not all + * CPUs reach shutdown state. */ timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && (wait || timeout--)) + while (num_online_cpus() > 1 && timeout--) udelay(1); } - - /* if the REBOOT_VECTOR didn't work, try with the NMI */ - if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) { - if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, - NMI_FLAG_FIRST, "smp_stop")) - /* Note: we ignore failures here */ - /* Hope the REBOOT_IRQ is good enough */ - goto finish; - /* sync above data before sending IRQ */ - wmb(); - - pr_emerg("Shutting down cpus with NMI\n"); + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if (num_online_cpus() > 1) { + /* + * If NMI IPI is enabled, try to register the stop handler + * and send the IPI. 
In any case try to wait for the other + * CPUs to stop. */ + if (!smp_no_nmi_ipi && !register_stop_handler()) { + /* Sync above data before sending IRQ */ + wmb(); - apic->send_IPI_allbutself(NMI_VECTOR); + pr_emerg("Shutting down cpus with NMI\n"); + apic_send_IPI_allbutself(NMI_VECTOR); + } /* - * Don't wait longer than a 10 ms if the caller - * didn't ask us to wait. + * Don't wait longer than 10 ms if the caller didn't + * request it. If wait is true, the machine hangs here if + * one or more CPUs do not reach shutdown state. */ timeout = USEC_PER_MSEC * 10; while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } -finish: local_irq_save(flags); disable_local_APIC(); mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fdbd47ceb84d..69881b2d446c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1023,8 +1023,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, int *cpu0_nmi_registered) { - volatile u32 *trampoline_status = - (volatile u32 *) __va(real_mode_header->trampoline_status); /* start_ip had better be page-aligned! */ unsigned long start_ip = real_mode_header->trampoline_start; @@ -1116,9 +1114,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, } } - /* mark "stuck" area as not stuck */ - *trampoline_status = 0; - if (x86_platform.legacy.warm_reset) { /* * Cleanup possible dangling ends... @@ -1596,7 +1591,12 @@ int native_cpu_disable(void) if (ret) return ret; - clear_local_APIC(); + /* + * Disable the local APIC. Otherwise IPI broadcasts will reach + * it. It still responds normally to INIT, NMI, SMI, and SIPI + * messages. + */ + apic_soft_disable(); cpu_disable_common(); return 0; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 57d87f79558f..7e322e2daaf5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -638,7 +638,7 @@ unsigned long native_calibrate_tsc(void) * clock.
*/ if (crystal_khz == 0 && - boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_X) + boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D) crystal_khz = 25000; /* @@ -1505,6 +1505,9 @@ void __init tsc_init(void) return; } + if (tsc_clocksource_reliable || no_tsc_watchdog) + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 067858fe4db8..e0cbe4f2af49 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -58,6 +58,10 @@ static const struct freq_desc freq_desc_ann = { 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }; +static const struct freq_desc freq_desc_lgm = { + 1, { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 } +}; + static const struct x86_cpu_id tsc_msr_cpu_ids[] = { INTEL_CPU_FAM6(ATOM_SALTWELL_MID, freq_desc_pnw), INTEL_CPU_FAM6(ATOM_SALTWELL_TABLET, freq_desc_clv), @@ -65,6 +69,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { INTEL_CPU_FAM6(ATOM_SILVERMONT_MID, freq_desc_tng), INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), INTEL_CPU_FAM6(ATOM_AIRMONT_MID, freq_desc_ann), + INTEL_CPU_FAM6(ATOM_AIRMONT_NP, freq_desc_lgm), {} }; diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5b345add550f..548fefed71ee 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -19,7 +19,7 @@ /** DOC: Emulation for User-Mode Instruction Prevention (UMIP) * * The feature User-Mode Instruction Prevention present in recent Intel - * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) + * processor prevents a group of instructions (SGDT, SIDT, SLDT, SMSW and STR) * from being executed with CPL > 0. Otherwise, a general protection fault is * issued. * @@ -36,8 +36,8 @@ * DOSEMU2) rely on this subset of instructions to function. * * The instructions protected by UMIP can be split in two groups. Those which - * return a kernel memory address (sgdt and sidt) and those which return a - * value (sldt, str and smsw). + * return a kernel memory address (SGDT and SIDT) and those which return a + * value (SLDT, STR and SMSW). * * For the instructions that return a kernel memory address, applications * such as WineHQ rely on the result being located in the kernel memory space, @@ -45,15 +45,13 @@ * value that, lies close to the top of the kernel memory. The limit for the GDT * and the IDT are set to zero. * - * Given that sldt and str are not commonly used in programs that run on WineHQ + * Given that SLDT and STR are not commonly used in programs that run on WineHQ * or DOSEMU2, they are not emulated. * * The instruction smsw is emulated to return the value that the register CR0 * has at boot time as set in the head_32. * - * Also, emulation is provided only for 32-bit processes; 64-bit processes - * that attempt to use the instructions that UMIP protects will receive the - * SIGSEGV signal issued as a consequence of the general protection fault. + * Emulation is provided for both 32-bit and 64-bit processes. * * Care is taken to appropriately emulate the results when segmentation is * used. That is, rather than relying on USER_DS and USER_CS, the function @@ -63,17 +61,18 @@ * application uses a local descriptor table. 
*/ -#define UMIP_DUMMY_GDT_BASE 0xfffe0000 -#define UMIP_DUMMY_IDT_BASE 0xffff0000 +#define UMIP_DUMMY_GDT_BASE 0xfffffffffffe0000ULL +#define UMIP_DUMMY_IDT_BASE 0xffffffffffff0000ULL /* * The SGDT and SIDT instructions store the contents of the global descriptor * table and interrupt table registers, respectively. The destination is a * memory operand of X+2 bytes. X bytes are used to store the base address of - * the table and 2 bytes are used to store the limit. In 32-bit processes, the - * only processes for which emulation is provided, X has a value of 4. + * the table and 2 bytes are used to store the limit. In 32-bit processes X + * has a value of 4, in 64-bit processes X has a value of 8. */ -#define UMIP_GDT_IDT_BASE_SIZE 4 +#define UMIP_GDT_IDT_BASE_SIZE_64BIT 8 +#define UMIP_GDT_IDT_BASE_SIZE_32BIT 4 #define UMIP_GDT_IDT_LIMIT_SIZE 2 #define UMIP_INST_SGDT 0 /* 0F 01 /0 */ @@ -189,6 +188,7 @@ static int identify_insn(struct insn *insn) * @umip_inst: A constant indicating the instruction to emulate * @data: Buffer into which the dummy result is stored * @data_size: Size of the emulated result + * @x86_64: true if process is 64-bit, false otherwise * * Emulate an instruction protected by UMIP and provide a dummy result. The * result of the emulation is saved in @data. The size of the results depends @@ -202,11 +202,8 @@ static int identify_insn(struct insn *insn) * 0 on success, -EINVAL on error while emulating. */ static int emulate_umip_insn(struct insn *insn, int umip_inst, - unsigned char *data, int *data_size) + unsigned char *data, int *data_size, bool x86_64) { - unsigned long dummy_base_addr, dummy_value; - unsigned short dummy_limit = 0; - if (!data || !data_size || !insn) return -EINVAL; /* @@ -219,6 +216,9 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, * is always returned irrespective of the operand size. */ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) { + u64 dummy_base_addr; + u16 dummy_limit = 0; + /* SGDT and SIDT do not use registers operands. */ if (X86_MODRM_MOD(insn->modrm.value) == 3) return -EINVAL; @@ -228,13 +228,24 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst, else dummy_base_addr = UMIP_DUMMY_IDT_BASE; - *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE; + /* + * 64-bit processes use the entire dummy base address. + * 32-bit processes use the lower 32 bits of the base address. + * dummy_base_addr is always 64 bits, but we memcpy the correct + * number of bytes from it to the destination. + */ + if (x86_64) + *data_size = UMIP_GDT_IDT_BASE_SIZE_64BIT; + else + *data_size = UMIP_GDT_IDT_BASE_SIZE_32BIT; + + memcpy(data + 2, &dummy_base_addr, *data_size); - memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE); + *data_size += UMIP_GDT_IDT_LIMIT_SIZE; memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE); } else if (umip_inst == UMIP_INST_SMSW) { - dummy_value = CR0_STATE; + unsigned long dummy_value = CR0_STATE; /* * Even though the CR0 register has 4 bytes, the number @@ -290,11 +301,10 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) * fixup_umip_exception() - Fixup a general protection fault caused by UMIP * @regs: Registers as saved when entering the #GP handler * - * The instructions sgdt, sidt, str, smsw, sldt cause a general protection - * fault if executed with CPL > 0 (i.e., from user space). 
If the offending - * user-space process is not in long mode, this function fixes the exception - * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not - * fixed up. Also long mode user-space processes are not fixed up. + * The instructions SGDT, SIDT, STR, SMSW and SLDT cause a general protection + * fault if executed with CPL > 0 (i.e., from user space). This function fixes + * the exception up and provides dummy results for SGDT, SIDT and SMSW; STR + * and SLDT are not fixed up. * * If operands are memory addresses, results are copied to user-space memory as * indicated by the instruction pointed by eIP using the registers indicated in @@ -373,13 +383,14 @@ bool fixup_umip_exception(struct pt_regs *regs) umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); - /* Do not emulate SLDT, STR or user long mode processes. */ - if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs)) + /* Do not emulate (spoof) SLDT or STR. */ + if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) return false; umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); - if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size)) + if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, + user_64bit_mode(regs))) return false; /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1bef687faf22..18a799c8fa28 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -95,6 +95,7 @@ struct x86_init_ops x86_init __initdata = { }, .acpi = { + .set_root_pointer = x86_default_set_root_pointer, .get_root_pointer = x86_default_get_root_pointer, .reduced_hw_early_init = acpi_generic_reduced_hw_init, }, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 22c2720cd948..f68c0c753c38 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -304,7 +304,13 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, case 7: case 0xb: case 0xd: + case 0xf: + case 0x10: + case 0x12: case 0x14: + case 0x17: + case 0x18: + case 0x1f: case 0x8000001d: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; break; @@ -357,10 +363,10 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = @@ -392,6 +398,12 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) + entry->edx |= F(SPEC_CTRL); + if (boot_cpu_has(X86_FEATURE_STIBP)) + entry->edx |= F(INTEL_STIBP); + if (boot_cpu_has(X86_FEATURE_SSBD)) + entry->edx |= F(SPEC_CTRL_SSBD); /* * We emulate ARCH_CAPABILITIES in software even * if the host doesn't support it. 
@@ -473,6 +485,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = + F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); @@ -606,16 +619,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, */ case 0x1f: case 0xb: { - int i, level_type; + int i; - /* read more entries until level_type is zero */ - for (i = 1; ; ++i) { + /* + * We filled in entry[0] for CPUID(EAX=<function>, + * ECX=00H) above. If its level type (ECX[15:8]) is + * zero, then the leaf is unimplemented, and we're + * done. Otherwise, continue to populate entries + * until the level type (ECX[15:8]) of the previously + * added entry is zero. + */ + for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { if (*nent >= maxnent) goto out; - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; do_host_cpuid(&entry[i], function, i); ++*nent; } @@ -729,18 +746,23 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); /* - * IBRS, IBPB and VIRT_SSBD aren't necessarily present in - * hardware cpuid + * AMD has separate bits for each SPEC_CTRL bit. + * arch/x86/kernel/cpu/bugs.c is kind enough to + * record that in cpufeatures so use them. */ - if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) + if (boot_cpu_has(X86_FEATURE_IBPB)) entry->ebx |= F(AMD_IBPB); - if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) + if (boot_cpu_has(X86_FEATURE_IBRS)) entry->ebx |= F(AMD_IBRS); - if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) - entry->ebx |= F(VIRT_SSBD); - entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + if (boot_cpu_has(X86_FEATURE_STIBP)) + entry->ebx |= F(AMD_STIBP); + if (boot_cpu_has(X86_FEATURE_SSBD)) + entry->ebx |= F(AMD_SSBD); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + entry->ebx |= F(AMD_SSB_NO); /* * The preference is to use SPEC CTRL MSR instead of the * VIRT_SPEC MSR. @@ -952,53 +974,66 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. + * If the basic or extended CPUID leaf requested is higher than the + * maximum supported basic or extended leaf, respectively, then it is + * out of range. 
*/ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) +static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) { - struct kvm_cpuid_entry2 *maxlevel; - - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); + struct kvm_cpuid_entry2 *max; + + max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + return max && function <= max->eax; } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) { u32 function = *eax, index = *ecx; - struct kvm_cpuid_entry2 *best; - bool entry_found = true; - - best = kvm_find_cpuid_entry(vcpu, function, index); + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *max; + bool found; - if (!best) { - entry_found = false; - if (!check_limit) - goto out; - - best = check_cpuid_limit(vcpu, function, index); + entry = kvm_find_cpuid_entry(vcpu, function, index); + found = entry; + /* + * Intel CPUID semantics treats any query for an out-of-range + * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were + * requested. AMD CPUID semantics returns all zeroes for any + * undefined leaf, whether or not the leaf is in range. + */ + if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && + !cpuid_function_in_range(vcpu, function)) { + max = kvm_find_cpuid_entry(vcpu, 0, 0); + if (max) { + function = max->eax; + entry = kvm_find_cpuid_entry(vcpu, function, index); + } } - -out: - if (best) { - *eax = best->eax; - *ebx = best->ebx; - *ecx = best->ecx; - *edx = best->edx; - } else + if (entry) { + *eax = entry->eax; + *ebx = entry->ebx; + *ecx = entry->ecx; + *edx = entry->edx; + } else { *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); - return entry_found; + /* + * When leaf 0BH or 1FH is defined, CL is pass-through + * and EDX is always the x2APIC ID, even for undefined + * subleaves. Index 1 will exist iff the leaf is + * implemented, so we pass through CL iff leaf 1 + * exists. EDX can be copied from any existing index. 
+ */ + if (function == 0xb || function == 0x1f) { + entry = kvm_find_cpuid_entry(vcpu, function, 1); + if (entry) { + *ecx = index & 0xff; + *edx = entry->edx; + } + } + } + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); + return found; } EXPORT_SYMBOL_GPL(kvm_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 718f7d9afedc..698efb8c3897 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4156,6 +4156,20 @@ out: return rc; } +static int em_xsetbv(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ecx, edx; + + eax = reg_read(ctxt, VCPU_REGS_RAX); + edx = reg_read(ctxt, VCPU_REGS_RDX); + ecx = reg_read(ctxt, VCPU_REGS_RCX); + + if (ctxt->ops->set_xcr(ctxt, ecx, ((u64)edx << 32) | eax)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4409,6 +4423,12 @@ static const struct opcode group7_rm1[] = { N, N, N, N, N, N, }; +static const struct opcode group7_rm2[] = { + N, + II(ImplicitOps | Priv, em_xsetbv, xsetbv), + N, N, N, N, N, N, +}; + static const struct opcode group7_rm3[] = { DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), II(SrcNone | Prot | EmulateOnUD, em_hypercall, vmmcall), @@ -4498,7 +4518,8 @@ static const struct group_dual group7 = { { }, { EXT(0, group7_rm0), EXT(0, group7_rm1), - N, EXT(0, group7_rm3), + EXT(0, group7_rm2), + EXT(0, group7_rm3), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), @@ -5144,7 +5165,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) else { rc = __do_insn_fetch_bytes(ctxt, 1); if (rc != X86EMUL_CONTINUE) - return rc; + goto done; } switch (mode) { @@ -5395,6 +5416,8 @@ done_prefixes: ctxt->memopp->addr.mem.ea + ctxt->_eip); done: + if (rc == X86EMUL_PROPAGATE_FAULT) + ctxt->have_exception = true; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index fff790a3f4ee..23ff65504d7e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -23,6 +23,7 @@ #include "ioapic.h" #include "hyperv.h" +#include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> @@ -645,7 +646,9 @@ static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) .vector = stimer->config.apic_vector }; - return !kvm_apic_set_irq(vcpu, &irq, NULL); + if (lapic_in_kernel(vcpu)) + return !kvm_apic_set_irq(vcpu, &irq, NULL); + return 0; } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) @@ -1852,7 +1855,13 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; - ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; + + /* + * Direct Synthetic timers only make sense with in-kernel + * LAPIC + */ + if (lapic_in_kernel(vcpu)) + ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; @@ -1864,7 +1873,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; - + if (!cpu_smt_possible()) + ent->eax |= HV_X64_NO_NONARCH_CORESHARING; /* * Default number of spinlock retry attempts, matches * HyperV 2016. 
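In the emulate.c hunk above, em_xsetbv() emulates the privileged XSETBV instruction: ECX selects the XCR, and EDX:EAX carries the 64-bit value. The unprivileged read counterpart XGETBV uses the same register convention, so the layout is easy to inspect from user space on any XSAVE-enabled system (on CPUs without OSXSAVE the instruction raises #UD). An illustrative sketch, not part of the patch:

#include <stdio.h>
#include <stdint.h>
#include <cpuid.h>

int main(void)
{
	unsigned int a, b, c, d;
	uint32_t eax, edx;

	/* CPUID.1:ECX.OSXSAVE[bit 27] gates user-space XGETBV */
	if (!__get_cpuid(1, &a, &b, &c, &d) || !(c & (1u << 27)))
		return 1;

	/* xgetbv: ECX selects the XCR, the value returns in EDX:EAX */
	asm volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
	printf("XCR0 = 0x%016llx\n", ((unsigned long long)edx << 32) | eax);
	return 0;
}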
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e904ff06a83d..b29d00b661ff 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -65,8 +65,11 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 -#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 +static bool lapic_timer_advance_dynamic __read_mostly; +#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_NS_INIT 1000 +#define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -108,11 +111,6 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline u8 kvm_xapic_id(struct kvm_lapic *apic) -{ - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) { return apic->vcpu->vcpu_id; @@ -1198,10 +1196,8 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) } EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); -static void apic_send_ipi(struct kvm_lapic *apic) +static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { - u32 icr_low = kvm_lapic_get_reg(apic, APIC_ICR); - u32 icr_high = kvm_lapic_get_reg(apic, APIC_ICR2); struct kvm_lapic_irq irq; irq.vector = icr_low & APIC_VECTOR_MASK; @@ -1487,26 +1483,25 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 ns; + /* Do not adjust for tiny fluctuations or large random spikes. */ + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_ADJUST_MAX || + abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_MIN) + return; + /* too early */ if (advance_expire_delta < 0) { ns = -advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns -= min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } else { /* too late */ ns = advance_expire_delta * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - timer_advance_ns += min((u32)ns, - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - apic->lapic_timer.timer_advance_adjust_done = true; - if (unlikely(timer_advance_ns > 5000)) { - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; - } + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) + timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -1526,7 +1521,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (unlikely(!apic->lapic_timer.timer_advance_adjust_done)) + if (lapic_timer_advance_dynamic) adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } @@ -1598,7 +1593,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); - hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_HARD); } else 
apic_timer_expired(apic); @@ -1914,8 +1909,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) } case APIC_ICR: /* No delay here, so we always clear the pending bit */ - kvm_lapic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); - apic_send_ipi(apic); + val &= ~(1 << 12); + apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); + kvm_lapic_set_reg(apic, APIC_ICR, val); break; case APIC_ICR2: @@ -2299,17 +2295,16 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; - apic->lapic_timer.timer_advance_adjust_done = false; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; + lapic_timer_advance_dynamic = true; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; - apic->lapic_timer.timer_advance_adjust_done = true; + lapic_timer_advance_dynamic = false; } - /* * APIC is created enabled. This will prevent kvm_lapic_set_base from * thinking that APIC state has changed. @@ -2484,7 +2479,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_HARD); } /* @@ -2707,11 +2702,14 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) return; /* - * INITs are latched while in SMM. Because an SMM CPU cannot - * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs - * and delay processing of INIT until the next RSM. + * INITs are latched while CPU is in specific states + * (SMM, VMX non-root mode, SVM with GIF=0). + * Because a CPU cannot be in these states immediately + * after it has processed an INIT signal (and thus in + * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs + * and leave the INIT pending. */ - if (is_smm(vcpu)) { + if (is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 50053d2b8b7b..1f5014852e20 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -35,7 +35,6 @@ struct kvm_timer { s64 advance_expire_delta; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; - bool timer_advance_adjust_done; }; struct kvm_lapic { @@ -243,4 +242,9 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base) return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); } +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 218b277bfda3..2ce9da58611e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> +#include <linux/kthread.h> #include <asm/page.h> #include <asm/pat.h> @@ -47,6 +48,35 @@ #include <asm/kvm_page_track.h> #include "trace.h" +extern bool itlb_multihit_kvm_mitigation; + +static int __read_mostly nx_huge_pages = -1; +#ifdef CONFIG_PREEMPT_RT +/* Recovery can cause latency spikes, disable it for PREEMPT_RT. 
*/ +static uint __read_mostly nx_huge_pages_recovery_ratio = 0; +#else +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; +#endif + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); + +static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, +}; + +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { + .set = set_nx_huge_pages_recovery_ratio, + .get = param_get_uint, +}; + +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, + &nx_huge_pages_recovery_ratio, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -83,7 +113,17 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 #define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -214,16 +254,16 @@ static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_mmio_value; +static u64 __read_mostly shadow_mmio_access_mask; static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. - * Non-present SPTEs with shadow_acc_track_value set are in place for access - * tracking. + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. */ static u64 __read_mostly shadow_acc_track_mask; -static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -299,34 +339,63 @@ static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { + BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); - shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; + shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static bool is_mmio_spte(u64 spte) +{ + return (spte & shadow_mmio_mask) == shadow_mmio_value; +} + static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) { return sp->role.ad_disabled; } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. 
This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + static inline bool spte_ad_enabled(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); - return !(spte & shadow_acc_track_value); + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; +} + +static bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); } static inline u64 spte_shadow_accessed_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_accessed_mask : 0; } static inline u64 spte_shadow_dirty_mask(u64 spte) { - MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value); + MMU_WARN_ON(is_mmio_spte(spte)); return spte_ad_enabled(spte) ? shadow_dirty_mask : 0; } @@ -389,23 +458,16 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; - access &= ACC_WRITE_MASK | ACC_USER_MASK; + access &= shadow_mmio_access_mask; mask |= shadow_mmio_value | access; mask |= gpa | shadow_nonpresent_or_rsvd_mask; mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; - page_header(__pa(sptep))->mmio_cached = true; - trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } -static bool is_mmio_spte(u64 spte) -{ - return (spte & shadow_mmio_mask) == shadow_mmio_value; -} - static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; @@ -418,8 +480,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte) static unsigned get_mmio_spte_access(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask; - return (spte & ~mask) & ~PAGE_MASK; + return spte & shadow_mmio_access_mask; } static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, @@ -461,7 +522,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & shadow_acc_track_value); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -1164,6 +1225,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + if (sp->lpage_disallowed) + return; + + ++kvm->stat.nx_lpage_splits; + list_add_tail(&sp->lpage_disallowed_link, + &kvm->arch.lpage_disallowed_mmu_pages); + sp->lpage_disallowed = true; +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -1181,6 +1253,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; + list_del(&sp->lpage_disallowed_link); +} + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -1589,16 +1668,16 @@ static bool spte_clear_dirty(u64 *sptep) rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + 
MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; - return mmu_spte_update(sptep, spte); } -static bool wrprot_ad_disabled_spte(u64 *sptep) +static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); - if (was_writable) + if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); return was_writable; @@ -1617,10 +1696,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_clear_dirty(sptep); + if (spte_ad_need_write_protect(*sptep)) + flush |= spte_wrprot_for_clear_dirty(sptep); else - flush |= wrprot_ad_disabled_spte(sptep); + flush |= spte_clear_dirty(sptep); return flush; } @@ -1631,6 +1710,11 @@ static bool spte_set_dirty(u64 *sptep) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + /* + * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, + * do not bother adding back write access to pages marked + * SPTE_AD_WRPROT_ONLY_MASK. + */ spte |= shadow_dirty_mask; return mmu_spte_update(sptep, spte); @@ -2095,6 +2179,13 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + /* + * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() + * depends on valid pages being added to the head of the list. See + * comments in kvm_zap_obsolete_pages(). + */ + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; @@ -2244,7 +2335,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, #define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->role.invalid) { \ + if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ @@ -2301,6 +2392,12 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return sp->role.invalid || + unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -2609,7 +2706,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -2739,10 +2836,18 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - if (!sp->role.invalid) + /* + * Obsolete pages cannot be used on any vCPUs, see the comment + * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also + * treats invalid shadow pages as being obsolete. 
+ */ + if (!is_obsolete_sp(kvm, sp)) kvm_reload_remote_mmus(kvm); } + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + sp->role.invalid = 1; return list_unstable; } @@ -2950,7 +3055,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -2962,6 +3069,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -3182,9 +3294,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +{ + int level = *levelp; + u64 spte = *it.sptep; + + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + is_nx_huge_page_enabled() && + is_shadow_present_pte(spte) && + !is_large_pte(spte)) { + /* + * A small SPTE exists for this pfn, but FNAME(fetch) + * and __direct_map would like to create a large PTE + * instead: just force them to go down another level, + * restoring into pfn the next 9 bits of the + * address. + */ + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + *pfnp |= gfn & page_mask; + (*levelp)--; + } +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, int map_writable, int level, kvm_pfn_t pfn, - bool prefault) + bool prefault, bool lpage_disallowed) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3197,6 +3332,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &level); + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) break; @@ -3207,6 +3348,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -3255,7 +3398,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here.
*/ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -3290,7 +3433,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, } if (unlikely(is_noslot_pfn(pfn))) - vcpu_cache_mmio_info(vcpu, gva, gfn, access); + vcpu_cache_mmio_info(vcpu, gva, gfn, + access & shadow_mmio_access_mask); return false; } @@ -3498,11 +3642,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level = false; + bool force_pt_level; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + force_pt_level = lpage_disallowed; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { /* @@ -3536,7 +3683,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, + prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4122,6 +4270,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); @@ -4132,8 +4282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); + force_pt_level = + lpage_disallowed || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { if (level > PT_DIRECTORY_LEVEL && @@ -4162,7 +4313,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4233,6 +4385,13 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; if (cached_root_available(vcpu, new_cr3, new_role)) { + /* + * It is possible that the cached previous root page is + * obsolete because of a change in the MMU generation + * number. However, changing the generation number is + * accompanied by KVM_REQ_MMU_RELOAD, which will free + * the root set here and allocate a new one. + */ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -5361,7 +5520,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { int r, emulation_type = 0; - enum emulation_result er; bool direct = vcpu->arch.mmu->direct_map; /* With shadow page tables, fault_address contains a GVA or nGPA. 
*/ @@ -5428,19 +5586,8 @@ emulate: return 1; } - er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); - - switch (er) { - case EMULATE_DONE: - return 1; - case EMULATE_USER_EXIT: - ++vcpu->stat.mmio_exits; - /* fall through */ - case EMULATE_FAIL: - return 0; - default: - BUG(); - } + return x86_emulate_instruction(vcpu, cr2, emulation_type, insn, + insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); @@ -5592,13 +5739,13 @@ slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, PT_PAGE_TABLE_LEVEL, lock_flush_tlb); } -static void free_mmu_pages(struct kvm_vcpu *vcpu) +static void free_mmu_pages(struct kvm_mmu *mmu) { - free_page((unsigned long)vcpu->arch.mmu->pae_root); - free_page((unsigned long)vcpu->arch.mmu->lm_root); + free_page((unsigned long)mmu->pae_root); + free_page((unsigned long)mmu->lm_root); } -static int alloc_mmu_pages(struct kvm_vcpu *vcpu) +static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; @@ -5619,9 +5766,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) if (!page) return -ENOMEM; - vcpu->arch.mmu->pae_root = page_address(page); + mmu->pae_root = page_address(page); for (i = 0; i < 4; ++i) - vcpu->arch.mmu->pae_root[i] = INVALID_PAGE; + mmu->pae_root[i] = INVALID_PAGE; return 0; } @@ -5629,6 +5776,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { uint i; + int ret; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -5646,14 +5794,122 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; - return alloc_mmu_pages(vcpu); + + ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu); + if (ret) + return ret; + + ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu); + if (ret) + goto fail_allocate_root; + + return ret; + fail_allocate_root: + free_mmu_pages(&vcpu->arch.guest_mmu); + return ret; +} + +#define BATCH_ZAP_PAGES 10 +static void kvm_zap_obsolete_pages(struct kvm *kvm) +{ + struct kvm_mmu_page *sp, *node; + int nr_zapped, batch = 0; + +restart: + list_for_each_entry_safe_reverse(sp, node, + &kvm->arch.active_mmu_pages, link) { + /* + * No obsolete valid page exists before a newly created page + * since active_mmu_pages is a FIFO list. + */ + if (!is_obsolete_sp(kvm, sp)) + break; + + /* + * Skip invalid pages with a non-zero root count; zapping pages + * with a non-zero root count will never succeed, i.e. the page + * will get thrown back on active_mmu_pages and we'll get stuck + * in an infinite loop. + */ + if (sp->role.invalid && sp->root_count) + continue; + + /* + * No need to flush the TLB since we're only zapping shadow + * pages with an obsolete generation number and all vCPUs have + * loaded a new root, i.e. the shadow pages being zapped cannot + * be in active use by the guest. + */ + if (batch >= BATCH_ZAP_PAGES && + cond_resched_lock(&kvm->mmu_lock)) { + batch = 0; + goto restart; + } + + if (__kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { + batch += nr_zapped; + goto restart; + } + } + + /* + * Trigger a remote TLB flush before freeing the page tables to ensure + * KVM is not in the middle of a lockless shadow page table walk, which + * may reference the pages. + */ + kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); +} + +/* + * Fast-invalidate all shadow pages, using a lock-break technique + * to zap obsolete pages.
+ * + * This is required when a memslot is being deleted or the VM is being + * destroyed; in these cases, we must ensure that the KVM MMU does not + * use any resource of the slot being deleted (or, on VM destruction, + * of any slot) after the function returns. + */ +static void kvm_mmu_zap_all_fast(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->slots_lock); + + spin_lock(&kvm->mmu_lock); + trace_kvm_mmu_zap_all_fast(kvm); + + /* + * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is + * held for the entire duration of zapping obsolete pages, it's + * impossible for there to be multiple invalid generations associated + * with *valid* shadow pages at any given time, i.e. there is exactly + * one valid generation and (at most) one invalid generation. + */ + kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; + + /* + * Notify all vcpus to reload their shadow page tables and flush their + * TLBs. All vcpus will then switch to new shadow page tables carrying + * the new mmu_valid_gen. + * + * Note: this must be done under the protection of mmu_lock; otherwise, + * a vcpu could purge a shadow page but miss the TLB flush. + */ + kvm_reload_remote_mmus(kvm); + + kvm_zap_obsolete_pages(kvm); + spin_unlock(&kvm->mmu_lock); +} + +static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +{ + return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node) { - kvm_mmu_zap_all(kvm); + kvm_mmu_zap_all_fast(kvm); } void kvm_mmu_init_vm(struct kvm *kvm) @@ -5758,9 +6014,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -5846,7 +6102,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); -static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) +void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -5855,14 +6111,10 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (mmio_only && !sp->mmio_cached) - continue; if (sp->role.invalid && sp->root_count) continue; - if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) { - WARN_ON_ONCE(mmio_only); + if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; - } if (cond_resched_lock(&kvm->mmu_lock)) goto restart; } @@ -5871,11 +6123,6 @@ restart: spin_unlock(&kvm->mmu_lock); } -void kvm_mmu_zap_all(struct kvm *kvm) -{ - return __kvm_mmu_zap_all(kvm, false); -} - void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); @@ -5897,7 +6144,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); - __kvm_mmu_zap_all(kvm, true); + kvm_mmu_zap_all_fast(kvm); } } @@ -5928,16 +6175,24 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway.
*/ - if (!kvm->arch.n_used_mmu_pages) + if (!kvm->arch.n_used_mmu_pages && + !kvm_has_zapped_obsolete_pages(kvm)) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); + if (kvm_has_zapped_obsolete_pages(kvm)) { + kvm_mmu_commit_zap_page(kvm, + &kvm->arch.zapped_obsolete_pages); + goto unlock; + } + if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); +unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); @@ -5997,13 +6252,62 @@ static void kvm_set_mmio_spte_mask(void) if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) mask &= ~1ull; - kvm_mmu_set_mmio_spte_mask(mask, mask); + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); +} + +static bool get_nx_auto_mode(void) +{ + /* Return true when CPU has the bug, and mitigations are ON */ + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); +} + +static void __set_nx_huge_pages(bool val) +{ + nx_huge_pages = itlb_multihit_kvm_mitigation = val; +} + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) +{ + bool old_val = nx_huge_pages; + bool new_val; + + /* In "auto" mode deploy workaround only if CPU has the bug. */ + if (sysfs_streq(val, "off")) + new_val = 0; + else if (sysfs_streq(val, "force")) + new_val = 1; + else if (sysfs_streq(val, "auto")) + new_val = get_nx_auto_mode(); + else if (strtobool(val, &new_val) < 0) + return -EINVAL; + + __set_nx_huge_pages(new_val); + + if (new_val != old_val) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + mutex_lock(&kvm->slots_lock); + kvm_mmu_zap_all_fast(kvm); + mutex_unlock(&kvm->slots_lock); + + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + } + mutex_unlock(&kvm_lock); + } + + return 0; } int kvm_mmu_module_init(void) { int ret = -ENOMEM; + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); + /* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave @@ -6071,7 +6375,8 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - free_mmu_pages(vcpu); + free_mmu_pages(&vcpu->arch.root_mmu); + free_mmu_pages(&vcpu->arch.guest_mmu); mmu_free_memory_caches(vcpu); } @@ -6082,3 +6387,116 @@ void kvm_mmu_module_exit(void) unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } + +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +{ + unsigned int old_val; + int err; + + old_val = nx_huge_pages_recovery_ratio; + err = param_set_uint(val, kp); + if (err) + return err; + + if (READ_ONCE(nx_huge_pages) && + !old_val && nx_huge_pages_recovery_ratio) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + + mutex_unlock(&kvm_lock); + } + + return err; +} + +static void kvm_recover_nx_lpages(struct kvm *kvm) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? 
DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of lpage_disallowed pages is expected to + * be relatively small compared to the total. + */ + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (to_zap) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} + +static long get_nx_lpage_recovery_timeout(u64 start_time) +{ + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) + ? start_time + 60 * HZ - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; +} + +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +{ + u64 start_time; + long remaining_time; + + while (true) { + start_time = get_jiffies_64(); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop() && remaining_time > 0) { + schedule_timeout(remaining_time); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + return 0; + + kvm_recover_nx_lpages(kvm); + } +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + int err; + + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + "kvm-nx-lpage-recovery", + &kvm->arch.nx_lpage_recovery_thread); + if (!err) + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + + return err; +} + +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_lpage_recovery_thread) + kthread_stop(kvm->arch.nx_lpage_recovery_thread); +} diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 54c2a377795b..d55674f44a18 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -51,7 +51,7 @@ static inline u64 rsvd_bits(int s, int e) return ((1ULL << (e - s + 1)) - 1) << s; } -void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); +void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); @@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); + +int kvm_mmu_post_init_vm(struct kvm *kvm); +void kvm_mmu_pre_destroy_vm(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index d8001b4bca05..7ca8831c7d1a 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -8,16 +8,18 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define KVM_MMU_PAGE_FIELDS \ - __field(__u64, gfn) \ - __field(__u32, role) \ - __field(__u32, root_count) \ +#define KVM_MMU_PAGE_FIELDS \ + __field(__u8, mmu_valid_gen) \ + __field(__u64, gfn) \ + __field(__u32, role) \ + __field(__u32, root_count) \ __field(bool, unsync) -#define KVM_MMU_PAGE_ASSIGN(sp) \ - __entry->gfn = sp->gfn; \ - __entry->role = sp->role.word; \ - __entry->root_count = sp->root_count; \ +#define KVM_MMU_PAGE_ASSIGN(sp) \ + 
__entry->mmu_valid_gen = sp->mmu_valid_gen; \ + __entry->gfn = sp->gfn; \ + __entry->role = sp->role.word; \ + __entry->root_count = sp->root_count; \ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ @@ -29,8 +31,9 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx l%u %u-byte q%u%s %s%s" \ + trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" \ " %snxe %sad root %u %s%c", \ + __entry->mmu_valid_gen, \ __entry->gfn, role.level, \ role.gpte_is_8_bytes ? 8 : 4, \ role.quadrant, \ @@ -280,6 +283,27 @@ TRACE_EVENT( ); TRACE_EVENT( + kvm_mmu_zap_all_fast, + TP_PROTO(struct kvm *kvm), + TP_ARGS(kvm), + + TP_STRUCT__entry( + __field(__u8, mmu_valid_gen) + __field(unsigned int, mmu_used_pages) + ), + + TP_fast_assign( + __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; + __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; + ), + + TP_printk("kvm-mmu-valid-gen %u used_pages %x", + __entry->mmu_valid_gen, __entry->mmu_used_pages + ) +); + + +TRACE_EVENT( check_mmio_spte, TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen), TP_ARGS(spte, kvm_gen, spte_gen), diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 7d5cdb3af594..97b21e7fd013 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault, + bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; - gfn_t base_gfn; + gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - base_gfn = gw->gfn; + /* + * FNAME(page_fault) might have clobbered the bottom bits of + * gw->gfn, restore them from the virtual address. + */ + gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); + base_gfn = gfn; trace_kvm_mmu_spte_requested(addr, gw->level, pfn); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. 
+ */ + disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; @@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + bool force_pt_level = lpage_disallowed; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!force_pt_level) transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); + level, pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e0368076a1ef..c5673bda4b66 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -68,10 +68,8 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) #define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_NRIP (1 << 3) #define SVM_FEATURE_TSC_RATE (1 << 4) #define SVM_FEATURE_VMCB_CLEAN (1 << 5) #define SVM_FEATURE_FLUSH_ASID (1 << 6) @@ -736,8 +734,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; - if (!npt_enabled && !(efer & EFER_LMA)) - efer &= ~EFER_LME; + + if (!npt_enabled) { + /* Shadow paging assumes NX to be available. */ + efer |= EFER_NX; + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + } to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -770,7 +774,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -780,17 +784,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } if (!svm->next_rip) { - if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) != - EMULATE_DONE) - printk(KERN_DEBUG "%s: NOP\n", __func__); - return; + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + return 0; + } else { + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) + pr_err("%s: ip 0x%lx next 0x%llx\n", + __func__, kvm_rip_read(vcpu), svm->next_rip); + kvm_rip_write(vcpu, svm->next_rip); } - if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", - __func__, kvm_rip_read(vcpu), svm->next_rip); - - kvm_rip_write(vcpu, svm->next_rip); svm_set_interrupt_shadow(vcpu, 0); + + return 1; } static void svm_queue_exception(struct kvm_vcpu *vcpu) @@ -821,7 +825,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) * raises a fault that is not intercepted. Still better than * failing in all cases. 
*/ - skip_emulated_instruction(&svm->vcpu); + (void)skip_emulated_instruction(&svm->vcpu); rip = kvm_rip_read(&svm->vcpu); svm->int3_rip = rip + svm->vmcb->save.cs.base; svm->int3_injected = rip - old_rip; @@ -1269,11 +1273,11 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) pause_filter_count_grow, pause_filter_count_max); - if (control->pause_filter_count != old) + if (control->pause_filter_count != old) { mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - - trace_kvm_ple_window_grow(vcpu->vcpu_id, - control->pause_filter_count, old); + trace_kvm_ple_window_update(vcpu->vcpu_id, + control->pause_filter_count, old); + } } static void shrink_ple_window(struct kvm_vcpu *vcpu) @@ -1287,11 +1291,11 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) pause_filter_count, pause_filter_count_shrink, pause_filter_count); - if (control->pause_filter_count != old) + if (control->pause_filter_count != old) { mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - - trace_kvm_ple_window_shrink(vcpu->vcpu_id, - control->pause_filter_count, old); + trace_kvm_ple_window_update(vcpu->vcpu_id, + control->pause_filter_count, old); + } } static __init int svm_hardware_setup(void) @@ -1542,6 +1546,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); set_intercept(svm, INTERCEPT_XSETBV); + set_intercept(svm, INTERCEPT_RDPRU); set_intercept(svm, INTERCEPT_RSM); if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { @@ -2136,6 +2141,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; + BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!svm) { err = -ENOMEM; @@ -2768,17 +2776,18 @@ static int gp_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 error_code = svm->vmcb->control.exit_info_1; - int er; WARN_ON_ONCE(!enable_vmware_backdoor); - er = kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); - if (er == EMULATE_USER_EXIT) - return 0; - else if (er != EMULATE_DONE) + /* + * VMware backdoor emulation on #GP interception only handles IN{S}, + * OUT{S}, and RDPMC, none of which generate a non-zero error code. 
+ */ + if (error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); - return 1; + return 1; + } + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); } static bool is_erratum_383(void) @@ -2876,7 +2885,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string) - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -2903,13 +2912,11 @@ static int nop_on_interception(struct vcpu_svm *svm) static int halt_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; return kvm_emulate_halt(&svm->vcpu); } static int vmmcall_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_emulate_hypercall(&svm->vcpu); } @@ -3588,9 +3595,9 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, mark_all_dirty(svm->vmcb); } -static bool nested_svm_vmrun(struct vcpu_svm *svm) +static int nested_svm_vmrun(struct vcpu_svm *svm) { - int rc; + int ret; struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; @@ -3599,13 +3606,16 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) vmcb_gpa = svm->vmcb->save.rax; - rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); - if (rc) { - if (rc == -EINVAL) - kvm_inject_gp(&svm->vcpu, 0); - return false; + ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map); + if (ret == -EINVAL) { + kvm_inject_gp(&svm->vcpu, 0); + return 1; + } else if (ret) { + return kvm_skip_emulated_instruction(&svm->vcpu); } + ret = kvm_skip_emulated_instruction(&svm->vcpu); + nested_vmcb = map.hva; if (!nested_vmcb_checks(nested_vmcb)) { @@ -3616,7 +3626,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) kvm_vcpu_unmap(&svm->vcpu, &map, true); - return false; + return ret; } trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, @@ -3660,7 +3670,16 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map); - return true; + if (!nested_svm_vmrun_msrpm(svm)) { + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); + } + + return ret; } static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) @@ -3697,7 +3716,6 @@ static int vmload_interception(struct vcpu_svm *svm) nested_vmcb = map.hva; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); @@ -3724,7 +3742,6 @@ static int vmsave_interception(struct vcpu_svm *svm) nested_vmcb = map.hva; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); @@ -3738,27 +3755,7 @@ static int vmrun_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - /* Save rip after vmrun instruction */ - kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); - - if (!nested_svm_vmrun(svm)) - return 1; - - if (!nested_svm_vmrun_msrpm(svm)) - goto failed; - - return 1; - -failed: - - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); - - return 1; + return 
nested_svm_vmrun(svm); } static int stgi_interception(struct vcpu_svm *svm) @@ -3775,7 +3772,6 @@ static int stgi_interception(struct vcpu_svm *svm) if (vgif_enabled(svm)) clr_intercept(svm, INTERCEPT_STGI); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); @@ -3791,7 +3787,6 @@ static int clgi_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; ret = kvm_skip_emulated_instruction(&svm->vcpu); disable_gif(svm); @@ -3816,7 +3811,6 @@ static int invlpga_interception(struct vcpu_svm *svm) /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu)); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_skip_emulated_instruction(&svm->vcpu); } @@ -3839,13 +3833,18 @@ static int xsetbv_interception(struct vcpu_svm *svm) u32 index = kvm_rcx_read(&svm->vcpu); if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; return kvm_skip_emulated_instruction(&svm->vcpu); } return 1; } +static int rdpru_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; +} + static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; @@ -3898,25 +3897,20 @@ static int task_switch_interception(struct vcpu_svm *svm) if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT || (int_type == SVM_EXITINTINFO_TYPE_EXEPT && - (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) - skip_emulated_instruction(&svm->vcpu); + (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { + if (!skip_emulated_instruction(&svm->vcpu)) + return 0; + } if (int_type != SVM_EXITINTINFO_TYPE_SOFT) int_vec = -1; - if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, - has_error_code, error_code) == EMULATE_FAIL) { - svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - svm->vcpu.run->internal.ndata = 0; - return 0; - } - return 1; + return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, + has_error_code, error_code); } static int cpuid_interception(struct vcpu_svm *svm) { - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; return kvm_emulate_cpuid(&svm->vcpu); } @@ -3933,7 +3927,7 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) - return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0); kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); return kvm_skip_emulated_instruction(&svm->vcpu); @@ -3941,13 +3935,12 @@ static int invlpg_interception(struct vcpu_svm *svm) static int emulate_on_interception(struct vcpu_svm *svm) { - return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(&svm->vcpu, 0); } static int rsm_interception(struct vcpu_svm *svm) { - return kvm_emulate_instruction_from_buffer(&svm->vcpu, - rsm_ins_bytes, 2) == EMULATE_DONE; + return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2); } static int rdpmc_interception(struct vcpu_svm *svm) @@ -4232,23 +4225,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int rdmsr_interception(struct vcpu_svm *svm) { - u32 ecx = kvm_rcx_read(&svm->vcpu); - struct msr_data msr_info; - - msr_info.index = ecx; - 
msr_info.host_initiated = false; - if (svm_get_msr(&svm->vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } else { - trace_kvm_msr_read(ecx, msr_info.data); - - kvm_rax_write(&svm->vcpu, msr_info.data & 0xffffffff); - kvm_rdx_write(&svm->vcpu, msr_info.data >> 32); - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - return kvm_skip_emulated_instruction(&svm->vcpu); - } + return kvm_emulate_rdmsr(&svm->vcpu); } static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) @@ -4438,23 +4415,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) static int wrmsr_interception(struct vcpu_svm *svm) { - struct msr_data msr; - u32 ecx = kvm_rcx_read(&svm->vcpu); - u64 data = kvm_read_edx_eax(&svm->vcpu); - - msr.data = data; - msr.index = ecx; - msr.host_initiated = false; - - svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (kvm_set_msr(&svm->vcpu, &msr)) { - trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(&svm->vcpu, 0); - return 1; - } else { - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(&svm->vcpu); - } + return kvm_emulate_wrmsr(&svm->vcpu); } static int msr_interception(struct vcpu_svm *svm) @@ -4636,6 +4597,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + u32 id = kvm_xapic_id(vcpu->arch.apic); if (ldr == svm->ldr_reg) return 0; @@ -4643,7 +4605,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) avic_invalidate_logical_id_entry(vcpu); if (ldr) - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + ret = avic_ldr_write(vcpu, id, ldr); if (!ret) svm->ldr_reg = ldr; @@ -4655,8 +4617,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) { u64 *old, *new; struct vcpu_svm *svm = to_svm(vcpu); - u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); - u32 id = (apic_id_reg >> 24) & 0xff; + u32 id = kvm_xapic_id(vcpu->arch.apic); if (vcpu->vcpu_id == id) return 0; @@ -4768,7 +4729,7 @@ static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) ret = avic_unaccel_trap_write(svm); } else { /* Handling Fault */ - ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + ret = kvm_emulate_instruction(&svm->vcpu, 0); } return ret; @@ -4835,6 +4796,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MONITOR] = monitor_interception, [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, + [SVM_EXIT_RDPRU] = rdpru_interception, [SVM_EXIT_NPF] = npf_interception, [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, @@ -5025,9 +4987,14 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code); - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); + dump_vmcb(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 1; + vcpu->run->internal.data[0] = exit_code; + return 0; } return svm_exit_handlers[exit_code](svm); @@ -5274,7 +5241,8 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 
+ !kvm_irq_is_postable(&irq)) { pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", __func__, irq.vector); return -1; } @@ -5328,6 +5296,7 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * 1. When the interrupt cannot be targeted to a specific vcpu. * 2. Unsetting posted interrupt. * 3. APIC virtualization is disabled for the vcpu. + * 4. IRQ has an incompatible delivery mode (SMI, INIT, etc.) */ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && kvm_vcpu_apicv_active(&svm->vcpu)) { @@ -5933,6 +5902,8 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); } +#define F(x) bit(X86_FEATURE_##x) + static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { @@ -5944,6 +5915,11 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) if (nested) entry->ecx |= (1 << 2); /* Set SVM bit */ break; + case 0x80000008: + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + entry->ebx |= F(VIRT_SSBD); + break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper @@ -5954,11 +5930,11 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) /* Support next_rip if host supports it */ if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= SVM_FEATURE_NRIP; + entry->edx |= F(NRIPS); /* Support NPT for the guest if enabled */ if (npt_enabled) - entry->edx |= SVM_FEATURE_NPT; + entry->edx |= F(NPT); break; case 0x8000001F: @@ -6067,6 +6043,7 @@ static const struct __x86_intercept { [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), }; #undef PRE_EX @@ -7128,13 +7105,6 @@ failed: return ret; } -static int nested_enable_evmcs(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version) -{ - /* Intel-only feature */ - return -ENODEV; -} - static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) { unsigned long cr4 = kvm_read_cr4(vcpu); @@ -7193,6 +7163,21 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) return false; } +static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * TODO: The last condition latches INIT signals on the vCPU when the + * vCPU is in guest-mode and vmcb12 defines an intercept on INIT. + * To properly emulate the INIT intercept, SVM should implement + * kvm_x86_ops->check_nested_events() and call nested_svm_vmexit() + * there if an INIT signal is pending.
+ */ + return !gif_set(svm) || + (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -7325,10 +7310,12 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .nested_enable_evmcs = nested_enable_evmcs, + .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + + .apic_init_signal_blocked = svm_apic_init_signal_blocked, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b5c831e79094..7c741a0c5f80 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -232,17 +232,20 @@ TRACE_EVENT(kvm_exit, __field( u32, isa ) __field( u64, info1 ) __field( u64, info2 ) + __field( unsigned int, vcpu_id ) ), TP_fast_assign( __entry->exit_reason = exit_reason; __entry->guest_rip = kvm_rip_read(vcpu); __entry->isa = isa; + __entry->vcpu_id = vcpu->vcpu_id; kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, &__entry->info2); ), - TP_printk("reason %s rip 0x%lx info %llx %llx", + TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx", + __entry->vcpu_id, (__entry->isa == KVM_ISA_VMX) ? __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), @@ -887,36 +890,27 @@ TRACE_EVENT(kvm_pml_full, TP_printk("vcpu %d: PML full", __entry->vcpu_id) ); -TRACE_EVENT(kvm_ple_window, - TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), - TP_ARGS(grow, vcpu_id, new, old), +TRACE_EVENT(kvm_ple_window_update, + TP_PROTO(unsigned int vcpu_id, unsigned int new, unsigned int old), + TP_ARGS(vcpu_id, new, old), TP_STRUCT__entry( - __field( bool, grow ) __field( unsigned int, vcpu_id ) - __field( int, new ) - __field( int, old ) + __field( unsigned int, new ) + __field( unsigned int, old ) ), TP_fast_assign( - __entry->grow = grow; __entry->vcpu_id = vcpu_id; __entry->new = new; __entry->old = old; ), - TP_printk("vcpu %u: ple_window %d (%s %d)", - __entry->vcpu_id, - __entry->new, - __entry->grow ? "grow" : "shrink", - __entry->old) + TP_printk("vcpu %u old %u new %u (%s)", + __entry->vcpu_id, __entry->old, __entry->new, + __entry->old < __entry->new ?
"grew" : "shrunk") ); -#define trace_kvm_ple_window_grow(vcpu_id, new, old) \ - trace_kvm_ple_window(true, vcpu_id, new, old) -#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ - trace_kvm_ple_window(false, vcpu_id, new, old) - TRACE_EVENT(kvm_pvclock_update, TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock), TP_ARGS(vcpu_id, pvclock), @@ -1320,7 +1314,7 @@ TRACE_EVENT(kvm_avic_incomplete_ipi, __entry->index = index; ), - TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n", + TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u", __entry->vcpu, __entry->icrh, __entry->icrl, __entry->id, __entry->index) ); @@ -1345,7 +1339,7 @@ TRACE_EVENT(kvm_avic_unaccelerated_access, __entry->vec = vec; ), - TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n", + TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x", __entry->vcpu, __entry->offset, __print_symbolic(__entry->offset, kvm_trace_symbol_apic), @@ -1462,6 +1456,46 @@ TRACE_EVENT(kvm_hv_send_ipi_ex, __entry->vector, __entry->format, __entry->valid_bank_mask) ); + +TRACE_EVENT(kvm_pv_tlb_flush, + TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb), + TP_ARGS(vcpu_id, need_flush_tlb), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( bool, need_flush_tlb ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->need_flush_tlb = need_flush_tlb; + ), + + TP_printk("vcpu %u need_flush_tlb %s", __entry->vcpu_id, + __entry->need_flush_tlb ? "true" : "false") +); + +/* + * Tracepoint for failed nested VMX VM-Enter. + */ +TRACE_EVENT(kvm_nested_vmenter_failed, + TP_PROTO(const char *msg, u32 err), + TP_ARGS(msg, err), + + TP_STRUCT__entry( + __field(const char *, msg) + __field(u32, err) + ), + + TP_fast_assign( + __entry->msg = msg; + __entry->err = err; + ), + + TP_printk("%s%s", __entry->msg, !__entry->err ?
"" : + __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index d6664ee3d127..7aa69716d516 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -247,6 +247,12 @@ static inline bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static inline bool vmx_waitpkg_supported(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; +} + static inline bool cpu_has_vmx_tsc_scaling(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 39a24eec8884..07ebf6882a45 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -178,6 +178,8 @@ static inline void evmcs_load(u64 phys_addr) struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(smp_processor_id()); + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) + vp_ap->nested_control.features.directhypercall = 1; vp_ap->current_nested_vmcs = phys_addr; vp_ap->enlighten_vmentry = 1; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ced9fba32598..0e7c9301fe86 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -19,6 +19,14 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); static bool __read_mostly nested_early_check = 0; module_param(nested_early_check, bool, S_IRUGO); +#define CC(consistency_check) \ +({ \ + bool failed = (consistency_check); \ + if (failed) \ + trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ + failed; \ +}) + /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. 
@@ -190,6 +198,16 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } +static inline bool vmx_control_verify(u32 control, u32 low, u32 high) +{ + return fixed_bits_valid(control, low, high); +} + +static inline u64 vmx_control_msr(u32 low, u32 high) +{ + return low | ((u64)high << 32); +} + static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); @@ -430,8 +448,8 @@ static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return 0; - if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || - !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || + CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) return -EINVAL; return 0; @@ -443,7 +461,7 @@ static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; - if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) + if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) return -EINVAL; return 0; @@ -455,7 +473,7 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return 0; - if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)) + if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) return -EINVAL; return 0; @@ -688,7 +706,7 @@ static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !page_address_valid(vcpu, vmcs12->apic_access_addr)) + CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) return -EINVAL; else return 0; @@ -707,16 +725,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * If virtualize x2apic mode is enabled, * virtualize apic access must be disabled. */ - if (nested_cpu_has_virt_x2apic_mode(vmcs12) && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) + if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) return -EINVAL; /* * If virtual interrupt delivery is enabled, * we must exit on external interrupts. */ - if (nested_cpu_has_vid(vmcs12) && - !nested_exit_on_intr(vcpu)) + if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) return -EINVAL; /* @@ -727,15 +744,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && - (!nested_cpu_has_vid(vmcs12) || - !nested_exit_intr_ack_set(vcpu) || - (vmcs12->posted_intr_nv & 0xff00) || - (vmcs12->posted_intr_desc_addr & 0x3f) || - (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) + (CC(!nested_cpu_has_vid(vmcs12)) || + CC(!nested_exit_intr_ack_set(vcpu)) || + CC((vmcs12->posted_intr_nv & 0xff00)) || + CC((vmcs12->posted_intr_desc_addr & 0x3f)) || + CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))) return -EINVAL; /* tpr shadow is needed by all apicv features. 
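To make the CC() macro introduced at the top of this nested.c diff concrete: each consistency check is wrapped so that a failing check logs its own source text via stringification before yielding the result. Here is a hedged userspace sketch of the same pattern applied to the posted-interrupt rules above, with fprintf standing in for trace_kvm_nested_vmenter_failed(); like the kernel macro, it relies on GNU statement expressions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for trace_kvm_nested_vmenter_failed(). */
#define trace_vmenter_failed(msg) fprintf(stderr, "nested VM-Enter failed: %s\n", msg)

/* Same shape as the kernel's CC(): evaluate the check, trace the literal
 * source text of a failing check, and yield the result. */
#define CC(consistency_check)				\
({							\
	bool failed = (consistency_check);		\
	if (failed)					\
		trace_vmenter_failed(#consistency_check);	\
	failed;						\
})

/* The posted-interrupt field rules from the hunk above. */
static int check_posted_intr_fields(uint16_t nv, uint64_t desc_addr,
				    unsigned int maxphyaddr)
{
	if (CC(nv & 0xff00) ||			/* vector must fit in bits 7:0 */
	    CC(desc_addr & 0x3f) ||		/* descriptor is 64-byte aligned */
	    CC(desc_addr >> maxphyaddr))	/* must be a legal physical address */
		return -1;
	return 0;
}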
*/ - if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) return -EINVAL; return 0; @@ -759,10 +776,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count, - vmcs12->vm_exit_msr_load_addr) || - nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count, - vmcs12->vm_exit_msr_store_addr)) + if (CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_exit_msr_load_count, + vmcs12->vm_exit_msr_load_addr)) || + CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_exit_msr_store_count, + vmcs12->vm_exit_msr_store_addr))) return -EINVAL; return 0; @@ -771,8 +790,9 @@ static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count, - vmcs12->vm_entry_msr_load_addr)) + if (CC(nested_vmx_check_msr_switch(vcpu, + vmcs12->vm_entry_msr_load_count, + vmcs12->vm_entry_msr_load_addr))) return -EINVAL; return 0; @@ -784,8 +804,8 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has_pml(vmcs12)) return 0; - if (!nested_cpu_has_ept(vmcs12) || - !page_address_valid(vcpu, vmcs12->pml_address)) + if (CC(!nested_cpu_has_ept(vmcs12)) || + CC(!page_address_valid(vcpu, vmcs12->pml_address))) return -EINVAL; return 0; @@ -794,8 +814,8 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && - !nested_cpu_has_ept(vmcs12)) + if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && + !nested_cpu_has_ept(vmcs12))) return -EINVAL; return 0; } @@ -803,8 +823,8 @@ static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && - !nested_cpu_has_ept(vmcs12)) + if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && + !nested_cpu_has_ept(vmcs12))) return -EINVAL; return 0; } @@ -815,8 +835,8 @@ static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, if (!nested_cpu_has_shadow_vmcs(vmcs12)) return 0; - if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || - !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || + CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) return -EINVAL; return 0; @@ -826,12 +846,12 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { /* x2APIC MSR accesses are not allowed */ - if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) + if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) return -EINVAL; - if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ - e->index == MSR_IA32_UCODE_REV) + if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ + CC(e->index == MSR_IA32_UCODE_REV)) return -EINVAL; - if (e->reserved != 0) + if (CC(e->reserved != 0)) return -EINVAL; return 0; } @@ -839,9 +859,9 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, static int nested_vmx_load_msr_check(struct 
kvm_vcpu *vcpu, struct vmx_msr_entry *e) { - if (e->index == MSR_FS_BASE || - e->index == MSR_GS_BASE || - e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ + if (CC(e->index == MSR_FS_BASE) || + CC(e->index == MSR_GS_BASE) || + CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; @@ -850,24 +870,40 @@ static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { - if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ + if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ nested_vmx_msr_check_common(vcpu, e)) return -EINVAL; return 0; } +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. + * + * One of the failure modes for MSR load/store is when a list exceeds the + * virtual hardware's capacity. To maintain compatibility with hardware inasmuch + * as possible, process all valid entries before failing rather than precheck + * for a capacity violation. */ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u32 i; struct vmx_msr_entry e; - struct msr_data msr; + u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); - msr.host_initiated = false; for (i = 0; i < count; i++) { + if (unlikely(i >= max_msr_list_size)) + goto fail; + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { pr_debug_ratelimited( @@ -881,9 +917,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - msr.index = e.index; - msr.data = e.value; - if (kvm_set_msr(vcpu, &msr)) { + if (kvm_set_msr(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -897,11 +931,15 @@ fail: static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { + u64 data; u32 i; struct vmx_msr_entry e; + u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - struct msr_data msr_info; + if (unlikely(i >= max_msr_list_size)) + return -EINVAL; + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { @@ -916,9 +954,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); return -EINVAL; } - msr_info.host_initiated = false; - msr_info.index = e.index; - if (kvm_get_msr(vcpu, &msr_info)) { + if (kvm_get_msr(vcpu, e.index, &data)) { pr_debug_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); @@ -927,10 +963,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (kvm_vcpu_write_guest(vcpu, gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), - &msr_info.data, sizeof(msr_info.data))) { + &data, sizeof(data))) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, msr_info.data); + __func__, i, e.index, data); return -EINVAL; } } @@ -955,7 +991,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne u32 *entry_failure_code) { if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && 
pdptrs_changed(vcpu))) { - if (!nested_cr3_valid(vcpu, cr3)) { + if (CC(!nested_cr3_valid(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -965,7 +1001,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * must not be dereferenced. */ if (is_pae_paging(vcpu) && !nested_ept) { - if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; } @@ -1009,17 +1045,6 @@ static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; } - -static inline bool vmx_control_verify(u32 control, u32 low, u32 high) -{ - return fixed_bits_valid(control, low, high); -} - -static inline u64 vmx_control_msr(u32 low, u32 high) -{ - return low | ((u64)high << 32); -} - static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) { superset &= mask; @@ -2085,6 +2110,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC); @@ -2411,12 +2437,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) { - if (!nested_cpu_has_nmi_exiting(vmcs12) && - nested_cpu_has_virtual_nmis(vmcs12)) + if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && + nested_cpu_has_virtual_nmis(vmcs12))) return -EINVAL; - if (!nested_cpu_has_virtual_nmis(vmcs12) && - nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) + if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && + nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) return -EINVAL; return 0; @@ -2430,11 +2456,11 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) /* Check for memory type validity */ switch (address & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) return false; break; case VMX_EPTP_MT_WB: - if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) return false; break; default: @@ -2442,16 +2468,16 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) } /* only 4 levels page-walk length are valid */ - if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) + if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) return false; /* Reserved bits should not be set */ - if (address >> maxphyaddr || ((address >> 7) & 0x1f)) + if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ if (address & VMX_EPTP_AD_ENABLE_BIT) { - if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) return false; } @@ -2466,21 +2492,21 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control, - vmx->nested.msrs.pinbased_ctls_low, - vmx->nested.msrs.pinbased_ctls_high) || - !vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.msrs.procbased_ctls_low, - vmx->nested.msrs.procbased_ctls_high)) + if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, + vmx->nested.msrs.pinbased_ctls_low, + vmx->nested.msrs.pinbased_ctls_high)) || + 
CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, + vmx->nested.msrs.procbased_ctls_low, + vmx->nested.msrs.procbased_ctls_high))) return -EINVAL; if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.msrs.secondary_ctls_low, - vmx->nested.msrs.secondary_ctls_high)) + CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.msrs.secondary_ctls_low, + vmx->nested.msrs.secondary_ctls_high))) return -EINVAL; - if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) || + if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || @@ -2491,7 +2517,7 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || - (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) + CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) return -EINVAL; if (!nested_cpu_has_preemption_timer(vmcs12) && @@ -2499,17 +2525,17 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, return -EINVAL; if (nested_cpu_has_ept(vmcs12) && - !valid_ept_address(vcpu, vmcs12->ept_pointer)) + CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) return -EINVAL; if (nested_cpu_has_vmfunc(vmcs12)) { - if (vmcs12->vm_function_control & - ~vmx->nested.msrs.vmfunc_controls) + if (CC(vmcs12->vm_function_control & + ~vmx->nested.msrs.vmfunc_controls)) return -EINVAL; if (nested_cpu_has_eptp_switching(vmcs12)) { - if (!nested_cpu_has_ept(vmcs12) || - !page_address_valid(vcpu, vmcs12->eptp_list_address)) + if (CC(!nested_cpu_has_ept(vmcs12)) || + CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) return -EINVAL; } } @@ -2525,10 +2551,10 @@ static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.msrs.exit_ctls_low, - vmx->nested.msrs.exit_ctls_high) || - nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)) + if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, + vmx->nested.msrs.exit_ctls_low, + vmx->nested.msrs.exit_ctls_high)) || + CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) return -EINVAL; return 0; @@ -2542,9 +2568,9 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.msrs.entry_ctls_low, - vmx->nested.msrs.entry_ctls_high)) + if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, + vmx->nested.msrs.entry_ctls_low, + vmx->nested.msrs.entry_ctls_high))) return -EINVAL; /* @@ -2564,31 +2590,31 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; /* VM-entry interruption-info field: interruption type */ - if (intr_type == INTR_TYPE_RESERVED || - (intr_type == INTR_TYPE_OTHER_EVENT && - !nested_cpu_supports_monitor_trap_flag(vcpu))) + if (CC(intr_type == INTR_TYPE_RESERVED) || + CC(intr_type == INTR_TYPE_OTHER_EVENT && + !nested_cpu_supports_monitor_trap_flag(vcpu))) return -EINVAL; /* VM-entry interruption-info field: vector */ - if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || - 
(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || - (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) + if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || + CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || + CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) return -EINVAL; /* VM-entry interruption-info field: deliver error code */ should_have_error_code = intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && x86_exception_has_error_code(vector); - if (has_error_code != should_have_error_code) + if (CC(has_error_code != should_have_error_code)) return -EINVAL; /* VM-entry exception error code */ - if (has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) + if (CC(has_error_code && + vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ - if (intr_info & INTR_INFO_RESVD_BITS_MASK) + if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) return -EINVAL; /* VM-entry instruction length */ @@ -2596,9 +2622,9 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: - if ((vmcs12->vm_entry_instruction_len > 15) || - (vmcs12->vm_entry_instruction_len == 0 && - !nested_cpu_has_zero_length_injection(vcpu))) + if (CC(vmcs12->vm_entry_instruction_len > 15) || + CC(vmcs12->vm_entry_instruction_len == 0 && + !nested_cpu_has_zero_length_injection(vcpu))) return -EINVAL; } } @@ -2625,40 +2651,56 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, { bool ia32e; - if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || - !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || - !nested_cr3_valid(vcpu, vmcs12->host_cr3)) + if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || + CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || + CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) return -EINVAL; - if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) || - is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) + if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && - !kvm_pat_valid(vmcs12->host_ia32_pat)) + CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; - ia32e = (vmcs12->vm_exit_controls & - VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; - - if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || - vmcs12->host_cs_selector == 0 || - vmcs12->host_tr_selector == 0 || - (vmcs12->host_ss_selector == 0 && !ia32e)) +#ifdef CONFIG_X86_64 + ia32e = !!(vcpu->arch.efer & EFER_LMA); +#else + ia32e = false; +#endif + + if (ia32e) { + if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || + CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) + return -EINVAL; + } else { + if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || + CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || + CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || + CC((vmcs12->host_rip) >> 32)) + return
-EINVAL; + } + + if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || + CC(vmcs12->host_cs_selector == 0) || + CC(vmcs12->host_tr_selector == 0) || + CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; #ifdef CONFIG_X86_64 - if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) || - is_noncanonical_address(vmcs12->host_gs_base, vcpu) || - is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) || - is_noncanonical_address(vmcs12->host_idtr_base, vcpu) || - is_noncanonical_address(vmcs12->host_tr_base, vcpu)) + if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || + CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) return -EINVAL; #endif @@ -2669,9 +2711,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, * the host address-space size VM-exit control. */ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { - if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || - ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) + if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || + CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || + CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) return -EINVAL; } @@ -2688,16 +2730,16 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (vmcs12->vmcs_link_pointer == -1ull) return 0; - if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) return -EINVAL; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) + if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) return -EINVAL; shadow = map.hva; - if (shadow->hdr.revision_id != VMCS12_REVISION || - shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || + CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) r = -EINVAL; kvm_vcpu_unmap(vcpu, &map, false); @@ -2709,8 +2751,8 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, */ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) { - if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && - vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) + if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && + vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) return -EINVAL; return 0; @@ -2724,12 +2766,12 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, *exit_qual = ENTRY_FAIL_DEFAULT; - if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || - !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) + if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || + CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && - !kvm_pat_valid(vmcs12->guest_ia32_pat)) + 
CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { @@ -2749,16 +2791,16 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, if (to_vmx(vcpu)->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; - if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || - ((vmcs12->guest_cr0 & X86_CR0_PG) && - ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) + if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || + CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || + CC(((vmcs12->guest_cr0 & X86_CR0_PG) && + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) return -EINVAL; } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || - (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || + CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; if (nested_check_guest_non_reg_state(vmcs12)) @@ -2841,9 +2883,13 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); if (vm_fail) { + u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); + preempt_enable(); - WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != - VMXERR_ENTRY_INVALID_CONTROL_FIELD); + + trace_kvm_nested_vmenter_failed( + "early hardware check VM-instruction error: ", error); + WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -2871,7 +2917,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2891,19 +2937,18 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) vmx->nested.apic_access_page = NULL; } page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ if (!is_error_page(page)) { vmx->nested.apic_access_page = page; hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; } } @@ -2948,6 +2993,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + return true; } /* @@ -2986,13 +3032,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* * If from_vmentry is false, this is being called from state restore (either RSM * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. -+ * -+ * Returns: -+ * 0 - success, i.e. 
proceed with actual VMEnter -+ * 1 - consistency check VMExit -+ * -1 - consistency check VMFail + * + * Returns: + * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode + * NVMX_VMENTRY_VMFAIL: Consistency check VMFail + * NVMX_VMENTRY_VMEXIT: Consistency check VMExit + * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error */ -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -3035,11 +3083,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - nested_get_vmcs12_pages(vcpu); + if (unlikely(!nested_get_vmcs12_pages(vcpu))) + return NVMX_VMENTRY_KVM_INTERNAL_ERROR; if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return -1; + return NVMX_VMENTRY_VMFAIL; } if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) @@ -3103,7 +3152,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()). */ - return 0; + return NVMX_VMENTRY_SUCCESS; /* * A failed consistency check that leads to a VMExit during L1's @@ -3119,14 +3168,14 @@ vmentry_fail_vmexit: vmx_switch_vmcs(vcpu, &vmx->vmcs01); if (!from_vmentry) - return 1; + return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; - return 1; + return NVMX_VMENTRY_VMEXIT; } /* @@ -3136,9 +3185,9 @@ vmentry_fail_vmexit: static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; + enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); - int ret; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -3198,13 +3247,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ vmx->nested.nested_run_pending = 1; - ret = nested_vmx_enter_non_root_mode(vcpu, true); - vmx->nested.nested_run_pending = !ret; - if (ret > 0) - return 1; - else if (ret) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_CONTROL_FIELD); + status = nested_vmx_enter_non_root_mode(vcpu, true); + if (unlikely(status != NVMX_VMENTRY_SUCCESS)) + goto vmentry_failed; /* Hide L1D cache contents from the nested guest.
*/ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -3235,6 +3280,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return kvm_vcpu_halt(vcpu); } return 1; + +vmentry_failed: + vmx->nested.nested_run_pending = 0; + if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) + return 0; + if (status == NVMX_VMENTRY_VMEXIT) + return 1; + WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* @@ -3401,6 +3455,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) unsigned long exit_qual; bool block_nested_events = vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + + if (lapic_in_kernel(vcpu) && + test_bit(KVM_APIC_INIT, &apic->pending_events)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); + return 0; + } if (vcpu->arch.exception.pending && nested_vmx_check_exception(vcpu, &exit_qual)) { @@ -3889,7 +3952,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_msr_entry g, h; - struct msr_data msr; gpa_t gpa; u32 i, j; @@ -3949,7 +4011,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) * from the guest value. The intent is to stuff host state as * silently as possible, not to fully process the exit load list. */ - msr.host_initiated = false; for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { @@ -3979,9 +4040,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - msr.index = h.index; - msr.data = h.value; - if (kvm_set_msr(vcpu, &msr)) { + if (kvm_set_msr(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); @@ -4466,7 +4525,12 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; + free_nested(vcpu); + + /* Process a latched INIT during time CPU was in VMX operation */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + return nested_vmx_succeed(vcpu); } @@ -4540,6 +4604,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) int len; gva_t gva = 0; struct vmcs12 *vmcs12; + struct x86_exception e; short offset; if (!nested_vmx_check_permission(vcpu)) @@ -4588,7 +4653,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu) vmx_instruction_info, true, len, &gva)) return 1; /* _system ok, nested_vmx_check_permission has verified cpl=0 */ - kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL); + if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) + kvm_inject_page_fault(vcpu, &e); } return nested_vmx_succeed(vcpu); @@ -5259,8 +5325,9 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return false; if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); + trace_kvm_nested_vmenter_failed( + "hardware VM-instruction error: ", + vmcs_read32(VM_INSTRUCTION_ERROR)); return true; } @@ -5420,6 +5487,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_ENCLS: /* SGX is never exposed to L1 */ return false; + case EXIT_REASON_UMWAIT: + case EXIT_REASON_TPAUSE: + return nested_cpu_has2(vmcs12, + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); default: return true; } diff --git 
a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 187d39bf0bf1..6280f33e5fa6 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -6,6 +6,16 @@ #include "vmcs12.h" #include "vmx.h" +/* + * Status returned by nested_vmx_enter_non_root_mode(): + */ +enum nvmx_vmentry_status { + NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */ + NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */ + NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */ + NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */ +}; + void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, bool apicv); @@ -13,7 +23,8 @@ void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_vcpu_setup(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 2200fb698dd0..45eaedee2ac0 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -11,8 +11,13 @@ #include "vmcs.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) -#define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) + +asmlinkage void vmread_error(unsigned long field, bool fault); +void vmwrite_error(unsigned long field, unsigned long value); +void vmclear_error(struct vmcs *vmcs, u64 phys_addr); +void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); +void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); +void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); static __always_inline void vmcs_check16(unsigned long field) { @@ -62,8 +67,22 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; - asm volatile (__ex_clear("vmread %1, %0", "%k0") - : "=r"(value) : "r"(field)); + asm volatile("1: vmread %2, %1\n\t" + ".byte 0x3e\n\t" /* branch taken hint */ + "ja 3f\n\t" + "mov %2, %%" _ASM_ARG1 "\n\t" + "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t" + "2: call vmread_error\n\t" + "xor %k1, %k1\n\t" + "3:\n\t" + + ".pushsection .fixup, \"ax\"\n\t" + "4: mov %2, %%" _ASM_ARG1 "\n\t" + "mov $1, %%" _ASM_ARG2 "\n\t" + "jmp 2b\n\t" + ".popsection\n\t" + _ASM_EXTABLE(1b, 4b) + : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc"); return value; } @@ -103,21 +122,39 @@ static __always_inline unsigned long vmcs_readl(unsigned long field) return __vmcs_readl(field); } -static noinline void vmwrite_error(unsigned long field, unsigned long value) -{ - printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n", - field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); - dump_stack(); -} +#define vmx_asm1(insn, op1, error_args...) \ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \ + ".byte 0x2e\n\t" /* branch not taken hint */ \ + "jna %l[error]\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + : : op1 : "cc" : error, fault); \ + return; \ +error: \ + insn##_error(error_args); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) + +#define vmx_asm2(insn, op1, op2, error_args...) 
\ +do { \ + asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \ + ".byte 0x2e\n\t" /* branch not taken hint */ \ + "jna %l[error]\n\t" \ + _ASM_EXTABLE(1b, %l[fault]) \ + : : op1, op2 : "cc" : error, fault); \ + return; \ +error: \ + insn##_error(error_args); \ + return; \ +fault: \ + kvm_spurious_fault(); \ +} while (0) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - bool error; - - asm volatile (__ex("vmwrite %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(field), "rm"(value)); - if (unlikely(error)) - vmwrite_error(field, value); + vmx_asm2(vmwrite, "r"(field), "rm"(value), field, value); } static __always_inline void vmcs_write16(unsigned long field, u16 value) @@ -182,28 +219,18 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) static inline void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - bool error; - asm volatile (__ex("vmclear %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", - vmcs, phys_addr); + vmx_asm1(vmclear, "m"(phys_addr), vmcs, phys_addr); } static inline void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex("vmptrld %1") CC_SET(na) - : CC_OUT(na) (error) : "m"(phys_addr)); - if (unlikely(error)) - printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", - vmcs, phys_addr); + vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); } static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) @@ -213,11 +240,8 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; } operand = { vpid, 0, gva }; - bool error; - asm volatile (__ex("invvpid %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); + vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva); } static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) @@ -225,11 +249,8 @@ static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; - bool error; - asm volatile (__ex("invept %2, %1") CC_SET(na) - : CC_OUT(na) (error) : "r"(ext), "m"(operand)); - BUG_ON(error); + vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); } static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4dea0e0e7e39..3e9c059099e9 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -262,6 +262,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -283,8 +284,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; + perf_get_x86_pmu_capability(&x86_pmu); + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -294,7 +297,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } else { pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed); 
pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 4010d519eb8c..751a384c2eb0 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -94,7 +94,7 @@ ENDPROC(vmx_vmexit) /** * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode - * @vmx: struct vcpu_vmx * + * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp) * @regs: unsigned long * (to guest registers) * @launched: %true if the VMCS has been launched * @@ -151,7 +151,7 @@ ENTRY(__vmx_vcpu_run) mov VCPU_R14(%_ASM_AX), %r14 mov VCPU_R15(%_ASM_AX), %r15 #endif - /* Load guest RAX. This kills the vmx_vcpu pointer! */ + /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX /* Enter guest mode */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c030c96fc81a..04a8212704c1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -209,6 +209,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) struct page *page; unsigned int i; + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; @@ -343,6 +348,48 @@ static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bit void vmx_vmexit(void); +#define vmx_insn_failed(fmt...) \ +do { \ + WARN_ONCE(1, fmt); \ + pr_warn_ratelimited(fmt); \ +} while (0) + +asmlinkage void vmread_error(unsigned long field, bool fault) +{ + if (fault) + kvm_spurious_fault(); + else + vmx_insn_failed("kvm: vmread failed: field=%lx\n", field); +} + +noinline void vmwrite_error(unsigned long field, unsigned long value) +{ + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n", + field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); +} + +noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) +{ + vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr); +} + +noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) +{ + vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr); +} + +noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) +{ + vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", + ext, vpid, gva); +} + +noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) +{ + vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", + ext, eptp, gpa); +} + static DEFINE_PER_CPU(struct vmcs *, vmxarea); DEFINE_PER_CPU(struct vmcs *, current_vmcs); /* @@ -486,6 +533,31 @@ static int hv_remote_flush_tlb(struct kvm *kvm) return hv_remote_flush_tlb_with_range(kvm, NULL); } +static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +{ + struct hv_enlightened_vmcs *evmcs; + struct hv_partition_assist_pg **p_hv_pa_pg = + &vcpu->kvm->arch.hyperv.hv_pa_pg; + /* + * Synthetic VM-Exit is not enabled in the current code, so all + * eVMCSes in a single VM share the same assist page. + */
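The code that follows is the usual allocate-once idiom for the per-VM partition assist page. A rough userspace analogue, with calloc standing in for kzalloc and 4096 for PAGE_SIZE (illustrative only):

#include <stdlib.h>

/* The shared page is allocated on first use and reused by every eVMCS in
 * the VM afterwards. */
static void *get_partition_assist_page(void **slot)
{
	if (!*slot)
		*slot = calloc(1, 4096);	/* 4096 stands in for PAGE_SIZE */
	return *slot;				/* NULL maps to -ENOMEM in the caller */
}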
+ */ + if (!*p_hv_pa_pg) + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); + + if (!*p_hv_pa_pg) + return -ENOMEM; + + evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; + + evmcs->partition_assist_page = + __pa(*p_hv_pa_pg); + evmcs->hv_vm_id = (unsigned long)vcpu->kvm; + evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; + + return 0; +} + #endif /* IS_ENABLED(CONFIG_HYPERV) */ /* @@ -897,17 +969,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } + /* Shadow paging assumes NX to be available. */ + if (!enable_ept) + guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. @@ -1204,6 +1268,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; @@ -1219,6 +1295,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); +after_clear_sn: + /* * Clear SN before reading the bitmap. The VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 in the @@ -1227,7 +1305,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) */ smp_mb__after_atomic(); - if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + if (!pi_is_pir_empty(pi_desc)) pi_set_on(pi_desc); } @@ -1472,17 +1550,32 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } - -static void skip_emulated_instruction(struct kvm_vcpu *vcpu) +static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { unsigned long rip; - rip = kvm_rip_read(vcpu); - rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - kvm_rip_write(vcpu, rip); + /* + * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on + * undefined behavior: Intel's SDM doesn't mandate the VMCS field be + * set when EPT misconfig occurs. In practice, real hardware updates + * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors + * (namely Hyper-V) don't set it due to it being undefined behavior, + * i.e. we end up advancing IP with some random value. 
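A compact sketch of the policy spelled out in this comment: trust the VMCS-reported length except when running under another hypervisor on an EPT-misconfig exit. The emulate_skip callback is a hypothetical stand-in for kvm_emulate_instruction(vcpu, EMULTYPE_SKIP):

#include <stdbool.h>
#include <stdint.h>

static bool skip_instruction(uint64_t *rip, uint32_t exit_instr_len,
			     bool under_hypervisor, bool ept_misconfig,
			     bool (*emulate_skip)(void))
{
	if (!under_hypervisor || !ept_misconfig) {
		*rip += exit_instr_len;	/* VM_EXIT_INSTRUCTION_LEN is reliable */
		return true;
	}
	return emulate_skip();		/* decode to find the real length */
}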
+ */ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || + to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) { + rip = kvm_rip_read(vcpu); + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + kvm_rip_write(vcpu, rip); + } else { + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + return 0; + } /* skipping an emulated instruction also counts */ vmx_set_interrupt_shadow(vcpu, 0); + + return 1; } static void vmx_clear_hlt(struct kvm_vcpu *vcpu) @@ -1517,8 +1610,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) int inc_eip = 0; if (kvm_exception_is_soft(nr)) inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, nr, inc_eip); return; } @@ -1690,6 +1782,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_UMWAIT_CONTROL: + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) + return 1; + + msr_info->data = vmx->msr_ia32_umwait_control; + break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -1863,6 +1961,16 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; + case MSR_IA32_UMWAIT_CONTROL: + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) + return 1; + + /* The reserved bit 1 and non-32 bit [63:32] should be zero */ + if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) + return 1; + + vmx->msr_ia32_umwait_control = data; + break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2280,6 +2388,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_PML | SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX | SECONDARY_EXEC_ENABLE_VMFUNC | @@ -4016,6 +4125,23 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } + if (vmx_waitpkg_supported()) { + bool waitpkg_enabled = + guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); + + if (!waitpkg_enabled) + exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + + if (nested) { + if (waitpkg_enabled) + vmx->nested.msrs.secondary_ctls_high |= + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + else + vmx->nested.msrs.secondary_ctls_high &= + ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; + } + } + vmx->secondary_exec_control = exec_control; } @@ -4026,7 +4152,7 @@ static void ept_set_mmio_spte_mask(void) * of an EPT paging-structure entry is 110b (write/execute). 
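For reference, the trick configured here: write+execute without read (110b) is an architecturally illegal EPT permission combination, so stuffing it into a shadow PTE guarantees that any access through it exits with EPT misconfig, which uniquely brands MMIO mappings. A self-contained sketch of the resulting test, with illustrative macro names:

#include <stdbool.h>
#include <stdint.h>

#define EPT_READABLE	(1ULL << 0)
#define EPT_WRITABLE	(1ULL << 1)
#define EPT_EXECUTABLE	(1ULL << 2)
#define EPT_RWX_MASK	(EPT_READABLE | EPT_WRITABLE | EPT_EXECUTABLE)

/* W=1, X=1, R=0: illegal by design, so it can only mean an MMIO SPTE. */
#define EPT_MISCONFIG_WX_VALUE	(EPT_WRITABLE | EPT_EXECUTABLE)

static bool is_mmio_spte(uint64_t spte)
{
	return (spte & EPT_RWX_MASK) == EPT_MISCONFIG_WX_VALUE;
}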
*/ kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, - VMX_EPT_MISCONFIG_WX_VALUE); + VMX_EPT_MISCONFIG_WX_VALUE, 0); } #define VMX_XSS_EXIT_BITMAP 0 @@ -4150,8 +4276,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; + vmx->msr_ia32_umwait_control = 0; + vcpu->arch.microcode_version = 0x100000000ULL; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); + vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); if (!init_event) { @@ -4266,8 +4395,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) int inc_eip = 0; if (vcpu->arch.interrupt.soft) inc_eip = vcpu->arch.event_exit_inst_len; - if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); return; } intr = irq | INTR_INFO_VALID_MASK; @@ -4303,8 +4431,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) - kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); return; } @@ -4431,7 +4558,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (kvm_emulate_instruction(vcpu, 0)) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); @@ -4482,7 +4609,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) u32 intr_info, ex_no, error_code; unsigned long cr2, rip, dr6; u32 vect_info; - enum emulation_result er; vect_info = vmx->idt_vectoring_info; intr_info = vmx->exit_intr_info; @@ -4499,13 +4625,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); - er = kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); - if (er == EMULATE_USER_EXIT) - return 0; - else if (er != EMULATE_DONE) + + /* + * VMware backdoor emulation on #GP interception only handles + * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero + * error code on #GP. 
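A sketch of the gate described in this comment: the VMware backdoor instructions always produce a zero #GP error code, so a non-zero code is a real fault to reflect back to the guest. queue_gp_exception() and emulate_vmware_backdoor() are hypothetical stand-ins for the KVM plumbing:

#include <stdint.h>

extern void queue_gp_exception(uint32_t error_code);
extern int emulate_vmware_backdoor(void);	/* 1 = handled, 0 = exit to userspace */

static int handle_gp_interception(uint32_t error_code)
{
	if (error_code) {
		queue_gp_exception(error_code);	/* a genuine #GP, not the backdoor */
		return 1;
	}
	return emulate_vmware_backdoor();
}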
+ */ + if (error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); - return 1; + return 1; + } + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); } /* @@ -4547,7 +4677,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; if (is_icebp(intr_info)) - skip_emulated_instruction(vcpu); + WARN_ON(!skip_emulated_instruction(vcpu)); kvm_queue_exception(vcpu, DB_VECTOR); return 1; @@ -4602,7 +4732,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string) - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -4676,7 +4806,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_cr(struct kvm_vcpu *vcpu) @@ -4856,41 +4986,12 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); - struct msr_data msr_info; - - msr_info.index = ecx; - msr_info.host_initiated = false; - if (vmx_get_msr(vcpu, &msr_info)) { - trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_read(ecx, msr_info.data); - - kvm_rax_write(vcpu, msr_info.data & -1u); - kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u); - return kvm_skip_emulated_instruction(vcpu); + return kvm_emulate_rdmsr(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) { - struct msr_data msr; - u32 ecx = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); - - msr.data = data; - msr.index = ecx; - msr.host_initiated = false; - if (kvm_set_msr(vcpu, &msr) != 0) { - trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(vcpu, 0); - return 1; - } - - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(vcpu); + return kvm_emulate_wrmsr(vcpu); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) @@ -4921,7 +5022,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -4955,20 +5056,6 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) return 1; } -static int handle_xsaves(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - -static int handle_xrstors(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN(1, "this should never happen\n"); - return 1; -} - static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { @@ -4988,7 +5075,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } } - return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0); } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) @@ -5057,23 +5144,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && type != INTR_TYPE_EXT_INTR && type != INTR_TYPE_NMI_INTR)) - skip_emulated_instruction(vcpu); - - if (kvm_task_switch(vcpu, tss_selector, - type == INTR_TYPE_SOFT_INTR ? 
idt_index : -1, reason, - has_error_code, error_code) == EMULATE_FAIL) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + WARN_ON(!skip_emulated_instruction(vcpu)); /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? */ - - return 1; + return kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, + reason, has_error_code, error_code); } static int handle_ept_violation(struct kvm_vcpu *vcpu) @@ -5132,21 +5211,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); - /* - * Doing kvm_skip_emulated_instruction() depends on undefined - * behavior: Intel's manual doesn't mandate - * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG - * occurs and while on real hardware it was observed to be set, - * other hypervisors (namely Hyper-V) don't set it, we end up - * advancing IP with some random value. Disable fast mmio when - * running nested and keep it for real hardware in hope that - * VM_EXIT_INSTRUCTION_LEN will always be set correctly. - */ - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) - return kvm_skip_emulated_instruction(vcpu); - else - return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == - EMULATE_DONE; + return kvm_skip_emulated_instruction(vcpu); } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -5165,8 +5230,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - enum emulation_result err = EMULATE_DONE; - int ret = 1; bool intr_window_requested; unsigned count = 130; @@ -5187,71 +5250,67 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = kvm_emulate_instruction(vcpu, 0); - - if (err == EMULATE_USER_EXIT) { - ++vcpu->stat.mmio_exits; - ret = 0; - goto out; - } - - if (err != EMULATE_DONE) - goto emulation_error; + if (!kvm_emulate_instruction(vcpu, 0)) + return 0; if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) - goto emulation_error; + vcpu->arch.exception.pending) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; - ret = kvm_vcpu_halt(vcpu); - goto out; + return kvm_vcpu_halt(vcpu); } + /* + * Note, return 1 and not 0, vcpu_run() is responsible for + * morphing the pending signal into the proper return code. 
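The comment above about returning 1 rather than 0 is the heart of the convention these hunks converge on: exit handlers (and now kvm_emulate_instruction()) return 1 to resume the guest and 0 to exit to userspace, instead of translating an EMULATE_* enum. A rough illustration of that convention; the handler name and the widget_handled_in_kernel() helper are invented for this sketch, not part of the patch:

static int handle_widget_exit(struct kvm_vcpu *vcpu)
{
	if (widget_handled_in_kernel(vcpu))
		return 1;	/* resume the guest */

	/* Otherwise populate kvm_run and bounce to userspace. */
	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
	vcpu->run->internal.ndata = 0;
	return 0;	/* exit to userspace */
}

With kvm_emulate_instruction() returning the same 0/1 values, callers can simply propagate its result, which is what most of the conversions in this file do.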
+ */ if (signal_pending(current)) - goto out; + return 1; + if (need_resched()) schedule(); } -out: - return ret; - -emulation_error: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; + return 1; } static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; + unsigned int old = vmx->ple_window; vmx->ple_window = __grow_ple_window(old, ple_window, ple_window_grow, ple_window_max); - if (vmx->ple_window != old) + if (vmx->ple_window != old) { vmx->ple_window_dirty = true; - - trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } } static void shrink_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int old = vmx->ple_window; + unsigned int old = vmx->ple_window; vmx->ple_window = __shrink_ple_window(old, ple_window, ple_window_shrink, ple_window); - if (vmx->ple_window != old) + if (vmx->ple_window != old) { vmx->ple_window_dirty = true; - - trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } } /* @@ -5541,8 +5600,6 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_xsaves, - [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, @@ -5887,8 +5944,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + dump_vmcs(); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 1; + vcpu->run->internal.data[0] = exit_reason; + return 0; } } @@ -6089,7 +6151,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. 
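A note on the grow_ple_window()/shrink_ple_window() changes above: ple_window becomes unsigned int and the consolidated kvm_ple_window_update tracepoint only fires when the value actually changes. The clamping helper they call is not shown in this diff; presumably it looks something like the reconstruction below, doing the arithmetic in 64 bits so the explicit ceiling, not integer overflow, bounds the result:

static unsigned int __grow_ple_window(unsigned int val, unsigned int base,
				      unsigned int modifier, unsigned int max)
{
	u64 ret = val;

	if (modifier < 1)
		return base;

	if (modifier < base)
		ret *= modifier;	/* multiplicative growth */
	else
		ret += modifier;	/* additive growth */

	return min(ret, (u64)max);	/* clamp in 64-bit math */
}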
*/ smp_mb__after_atomic(); @@ -6119,7 +6181,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) { - return pi_test_on(vcpu_to_pi_desc(vcpu)); + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -6373,6 +6438,23 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } +static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx) +{ + u32 host_umwait_control; + + if (!vmx_has_waitpkg(vmx)) + return; + + host_umwait_control = get_umwait_control_msr(); + + if (vmx->msr_ia32_umwait_control != host_umwait_control) + add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL, + vmx->msr_ia32_umwait_control, + host_umwait_control, false); + else + clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL); +} + static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6467,6 +6549,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); + atomic_switch_umwait_control_msr(vmx); if (enable_preemption_timer) vmx_update_hv_timer(vcpu); @@ -6522,6 +6605,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) current_evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + if (static_branch_unlikely(&enable_evmcs)) + current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index; + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ if (vmx->host_debugctlmsr) update_debugctlmsr(vmx->host_debugctlmsr); @@ -6589,6 +6675,7 @@ static struct kvm *vmx_vm_alloc(void) static void vmx_vm_free(struct kvm *kvm) { + kfree(kvm->arch.hyperv.hv_pa_pg); vfree(to_kvm_vmx(kvm)); } @@ -6615,6 +6702,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) unsigned long *msr_bitmap; int cpu; + BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx) return ERR_PTR(-ENOMEM); @@ -7369,10 +7459,14 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * irqbalance to make the interrupts single-CPU. * * We will support full lowest-priority interrupt later. + * + * In addition, we can only inject generic interrupts using + * the PI mechanism, refuse to route others through it. */ kvm_set_msi_irq(kvm, e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || + !kvm_irq_is_postable(&irq)) { /* * Make sure the IRTE is in remapped mode if * we don't handle it in posted mode. 
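The kvm_irq_is_postable() check added to vmx_update_pi_irte() above keeps anything but ordinary interrupts out of the posted-interrupt fast path; its body is not part of this hunk, but it presumably reduces to a delivery-mode test along these lines (a sketch under that assumption):

static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
{
	/*
	 * Only fixed and lowest-priority interrupts can be posted;
	 * NMI, SMI, INIT and friends must stay in remapped mode.
	 */
	return irq->delivery_mode == APIC_DM_FIXED ||
	       irq->delivery_mode == APIC_DM_LOWEST;
}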
@@ -7474,6 +7568,11 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) return false; } +static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.vmxon; +} + static __init int hardware_setup(void) { unsigned long host_bndcfgs; @@ -7799,6 +7898,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .nested_enable_evmcs = NULL, .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .apic_init_signal_blocked = vmx_apic_init_signal_blocked, }; static void vmx_cleanup_l1d_flush(void) @@ -7835,6 +7935,7 @@ static void vmx_exit(void) if (!vp_ap) continue; + vp_ap->nested_control.features.directhypercall = 0; vp_ap->current_nested_vmcs = 0; vp_ap->enlighten_vmentry = 0; } @@ -7874,6 +7975,11 @@ static int __init vmx_init(void) pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); static_branch_enable(&enable_evmcs); } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) + vmx_x86_ops.enable_direct_tlbflush + = hv_enable_direct_tlbflush; + } else { enlightened_vmcs = false; } @@ -7891,12 +7997,10 @@ static int __init vmx_init(void) * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ - if (boot_cpu_has(X86_BUG_L1TF)) { - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; } #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 82d0bc3a4d52..5a0f34b1e226 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -14,6 +14,8 @@ extern const u32 vmx_msr_index[]; extern u64 host_efer; +extern u32 get_umwait_control_msr(void); + #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 #define MSR_TYPE_RW 3 @@ -211,6 +213,7 @@ struct vcpu_vmx { #endif u64 spec_ctrl; + u32 msr_ia32_umwait_control; u32 secondary_exec_control; @@ -253,7 +256,7 @@ struct vcpu_vmx { struct nested_vmx nested; /* Dynamic PLE window. 
*/ - int ple_window; + unsigned int ple_window; bool ple_window_dirty; bool req_immediate_exit; @@ -352,6 +355,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + static inline void pi_set_sn(struct pi_desc *pi_desc) { set_bit(POSTED_INTR_SN, @@ -370,6 +378,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, @@ -497,6 +511,12 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } +static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) +{ + return vmx->secondary_exec_control & + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 290c3c3efb87..5d530521f11d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -212,7 +212,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, + { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -360,7 +361,7 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base); asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - BUG(); + BUG_ON(!kvm_rebooting); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); @@ -674,8 +675,14 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, data, offset, len, access); } +static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) +{ + return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | + rsvd_bits(1, 2); +} + /* - * Load the pae pdptrs. Return true is they are all valid. + * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. 
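The new pdptr_rsvd_bits() helper above hard-codes the architectural PAE PDPTE reserved bits instead of borrowing the MMU's cached mask, which need not match the PDPTE format. A worked example of the mask, assuming cpuid_maxphyaddr() returns 36 (the function below is illustrative, not from the patch):

static bool pdpte_valid_example(u64 pdpte)
{
	/* bits 63:36 | bits 8:5 | bits 2:1 == 0xfffffff0000001e6 */
	const u64 rsvd = rsvd_bits(36, 63) | rsvd_bits(5, 8) |
			 rsvd_bits(1, 2);

	/*
	 * Mirrors the load_pdptrs() check below: a present PDPTE
	 * with any reserved bit set is rejected.
	 */
	return !(pdpte & PT_PRESENT_MASK) || !(pdpte & rsvd);
}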
*/ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { @@ -694,8 +701,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && - (pdpte[i] & - vcpu->arch.mmu->guest_rsvd_check.rsvd_bits_mask[0][2])) { + (pdpte[i] & pdptr_rsvd_bits(vcpu))) { ret = 0; goto out; } @@ -879,34 +885,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - if (cr4 & CR4_RESERVED_BITS) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return -EINVAL; + + return 0; +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; + + if (kvm_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -1119,13 +1133,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * - * This list is modified at module load time to reflect the + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) + * extract the supported MSRs from the related const lists. + * msrs_to_save is selected from the msrs_to_save_all to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs * may depend on host virtualization features rather than host cpu features. 
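The MSR list rework that follows replaces in-place compaction of msrs_to_save and friends with immutable *_all master tables plus same-sized mutable arrays filled at init time. Stripped of the KVM specifics, the pattern looks like this (all names, and the feature_supported() predicate, are hypothetical):

static const u32 features_all[] = { 0x10, 0x1a0, 0x277, 0x6e0 };
static u32 features[ARRAY_SIZE(features_all)];
static unsigned int num_features;

static void init_feature_list(void)
{
	unsigned int i;

	num_features = 0;	/* safe to rerun, unlike compaction */
	for (i = 0; i < ARRAY_SIZE(features_all); i++) {
		if (!feature_supported(features_all[i]))
			continue;
		features[num_features++] = features_all[i];
	}
}

Keeping the master table const lets it live in rodata, and resetting the counter up front makes the init function idempotent, which in-place compaction of a single array was not.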
*/ -static u32 msrs_to_save[] = { +static const u32 msrs_to_save_all[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1140,11 +1156,36 @@ static u32 msrs_to_save[] = { MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, }; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, @@ -1183,7 +1224,7 @@ static u32 emulated_msrs[] = { * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. * We always support the "true" VMX control MSRs, even if the host * processor does not, so I am putting these registers here rather - * than in msrs_to_save. + * than in msrs_to_save_all. */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, @@ -1202,13 +1243,14 @@ static u32 emulated_msrs[] = { MSR_KVM_POLL_CONTROL, }; +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* * List of msr numbers which are used to expose MSR-based features that * can be used by a hypervisor to validate requested CPU features. */ -static u32 msr_based_features[] = { +static const u32 msr_based_features_all[] = { MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, MSR_IA32_VMX_PINBASED_CTLS, @@ -1233,6 +1275,7 @@ static u32 msr_based_features[] = { MSR_IA32_ARCH_CAPABILITIES, }; +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; static u64 kvm_get_arch_capabilities(void) @@ -1243,6 +1286,14 @@ static u64 kvm_get_arch_capabilities(void) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); /* + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that + * the nested hypervisor runs with NX huge pages. 
If it is not, + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other + * L1 guests, so it need not worry about its own (L2) guests. + */ + data |= ARCH_CAP_PSCHANGE_MC_NO; + + /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us @@ -1254,6 +1305,32 @@ static u64 kvm_get_arch_capabilities(void) if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + data |= ARCH_CAP_RDCL_NO; + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + data |= ARCH_CAP_SSB_NO; + if (!boot_cpu_has_bug(X86_BUG_MDS)) + data |= ARCH_CAP_MDS_NO; + + /* + * On TAA affected systems, export MDS_NO=0 when: + * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. + * - Updated microcode is present. This is detected by + * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures + * that VERW clears CPU buffers. + * + * When MDS_NO=0 is exported, guests deploy clear CPU buffer + * mitigation and don't complain: + * + * "Vulnerable: Clear CPU buffers attempted, no microcode" + * + * If TSX is disabled on the system, guests are also mitigated against + * TAA and clear CPU buffer mitigation is not required for guests. + */ + if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) && + (data & ARCH_CAP_TSX_CTRL_MSR)) + data &= ~ARCH_CAP_MDS_NO; + return data; } @@ -1351,19 +1428,23 @@ void kvm_enable_efer_bits(u64 mask) EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); /* - * Writes msr value into into the appropriate "register". + * Write @data into the MSR specified by @index. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, + bool host_initiated) { - switch (msr->index) { + struct msr_data msr; + + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: - if (is_noncanonical_address(msr->data, vcpu)) + if (is_noncanonical_address(data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: @@ -1380,38 +1461,95 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ - msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu)); + data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); } - return kvm_x86_ops->set_msr(vcpu, msr); + + msr.data = data; + msr.index = index; + msr.host_initiated = host_initiated; + + return kvm_x86_ops->set_msr(vcpu, &msr); } -EXPORT_SYMBOL_GPL(kvm_set_msr); /* - * Adapt set_msr() to msr_io()'s calling convention + * Read the MSR specified by @index into @data. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
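The split gives every caller an explicit choice of fault semantics: kvm_get_msr()/kvm_set_msr() become the guest-initiated wrappers, while the msr_io() adapters below pass host_initiated=true. A hedged sketch of the contrast, written as if inside x86.c (where __kvm_set_msr() and do_set_msr() are static); the wrapper function is invented for illustration:

static int msr_write_paths_example(struct kvm_vcpu *vcpu, u64 data)
{
	/*
	 * Guest-initiated: guest-visible checks apply, e.g. a
	 * non-canonical MSR_LSTAR value is refused (would #GP).
	 */
	if (kvm_set_msr(vcpu, MSR_LSTAR, data))
		return 1;

	/*
	 * Host-initiated (KVM_SET_MSRS ioctl path): select checks are
	 * bypassed so userspace can restore saved state verbatim.
	 */
	return do_set_msr(vcpu, MSR_LSTAR, &data);
}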
*/ -static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; - int r; + int ret; msr.index = index; - msr.host_initiated = true; - r = kvm_get_msr(vcpu, &msr); - if (r) - return r; + msr.host_initiated = host_initiated; - *data = msr.data; - return 0; + ret = kvm_x86_ops->get_msr(vcpu, &msr); + if (!ret) + *data = msr.data; + return ret; } -static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - struct msr_data msr; + return __kvm_get_msr(vcpu, index, data, false); +} +EXPORT_SYMBOL_GPL(kvm_get_msr); - msr.data = *data; - msr.index = index; - msr.host_initiated = true; - return kvm_set_msr(vcpu, &msr); +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return __kvm_set_msr(vcpu, index, data, false); +} +EXPORT_SYMBOL_GPL(kvm_set_msr); + +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_rcx_read(vcpu); + u64 data; + + if (kvm_get_msr(vcpu, ecx, &data)) { + trace_kvm_msr_read_ex(ecx); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_read(ecx, data); + + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); + + if (kvm_set_msr(vcpu, ecx, data)) { + trace_kvm_msr_write_ex(ecx, data); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_write(ecx, data); + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); + +/* + * Adapt set_msr() to msr_io()'s calling convention + */ +static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} + +static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + return __kvm_set_msr(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 @@ -2431,6 +2569,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { vcpu->arch.pv_time_enabled = false; + vcpu->arch.time = 0; } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) @@ -2452,6 +2591,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb(vcpu, false); @@ -2594,8 +2735,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME: { struct kvm_arch *ka = &vcpu->kvm->arch; - kvmclock_reset(vcpu); - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { bool tmp = (msr == MSR_KVM_SYSTEM_TIME); @@ -2609,14 +2748,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... 
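The kvm_emulate_rdmsr() helper added above splits the 64-bit MSR value across RAX and RDX exactly as the hardware instruction does, with -1u (0xffffffff) serving as the 32-bit mask. A small worked example of the split, using the register accessors from the surrounding code:

static void rdmsr_split_example(struct kvm_vcpu *vcpu)
{
	u64 data = 0x1122334455667788ULL;

	kvm_rax_write(vcpu, data & -1u);	 /* RAX = 0x55667788 */
	kvm_rdx_write(vcpu, (data >> 32) & -1u); /* RDX = 0x11223344 */
}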
*/ + vcpu->arch.pv_time_enabled = false; if (!(data & 1)) break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = false; - else vcpu->arch.pv_time_enabled = true; break; @@ -2748,18 +2886,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_msr_common); - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) -{ - return kvm_x86_ops->get_msr(vcpu, msr); -} -EXPORT_SYMBOL_GPL(kvm_get_msr); - static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; @@ -3106,7 +3232,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_EVENTFD: case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: - case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: case KVM_CAP_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: @@ -3183,6 +3308,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_x86_ops->get_nested_state ? kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; break; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + r = kvm_x86_ops->enable_direct_tlbflush != NULL; + break; + case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + r = kvm_x86_ops->nested_enable_evmcs != NULL; + break; default: break; } @@ -3506,8 +3637,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -3957,6 +4087,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EFAULT; } return r; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + if (!kvm_x86_ops->enable_direct_tlbflush) + return -ENOTTY; + + return kvm_x86_ops->enable_direct_tlbflush(vcpu); default: return -EINVAL; @@ -4986,18 +5121,28 @@ out: static void kvm_init_msr_list(void) { + struct x86_pmu_capability x86_pmu; u32 dummy[2]; - unsigned i, j; + unsigned i; - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, + "Please update the fixed PMCs in msrs_to_save_all[]"); + + perf_get_x86_pmu_capability(&x86_pmu); + + num_msrs_to_save = 0; + num_emulated_msrs = 0; + num_msr_based_features = 0; + + for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { + if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) continue; /* * Even MSRs that are valid in the host may not be exposed * to the guests in some cases. */ - switch (msrs_to_save[i]) { + switch (msrs_to_save_all[i]) { case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) continue; @@ -5025,43 +5170,43 @@ static void kvm_init_msr_list(void) break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { if (!kvm_x86_ops->pt_supported() || - msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >= + msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... 
MSR_ARCH_PERFMON_EVENTSEL0 + 17: + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; } default: break; } - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; + msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; } - num_msrs_to_save = j; - for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { + if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) continue; - if (j < i) - emulated_msrs[j] = emulated_msrs[i]; - j++; + emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - num_emulated_msrs = j; - for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) { + for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { struct kvm_msr_entry msr; - msr.index = msr_based_features[i]; + msr.index = msr_based_features_all[i]; if (kvm_get_msr_feature(&msr)) continue; - if (j < i) - msr_based_features[j] = msr_based_features[i]; - j++; + msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; } - num_msr_based_features = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -5312,6 +5457,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, /* kvm_write_guest_virt_system can pull in tons of pages. */ vcpu->arch.l1tf_flush_l1d = true; + /* + * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED + * is returned, but our callers are not ready for that and they blindly + * call kvm_inject_page_fault. Ensure that they at least do not leak + * uninitialized kernel stack memory into cr2 and error code. + */ + memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -5320,7 +5472,6 @@ EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); int handle_ud(struct kvm_vcpu *vcpu) { int emul_type = EMULTYPE_TRAP_UD; - enum emulation_result er; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; @@ -5329,15 +5480,10 @@ int handle_ud(struct kvm_vcpu *vcpu) sig, sizeof(sig), &e) == 0 && memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) { kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); - emul_type = 0; + emul_type = EMULTYPE_TRAP_UD_FORCED; } - er = kvm_emulate_instruction(vcpu, emul_type); - if (er == EMULATE_USER_EXIT) - return 0; - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + return kvm_emulate_instruction(vcpu, emul_type); } EXPORT_SYMBOL_GPL(handle_ud); @@ -5370,7 +5516,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, 0, access)) { + vcpu->arch.mmio_access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -5964,28 +6110,13 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - struct msr_data msr; - int r; - - msr.index = msr_index; - msr.host_initiated = false; - r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); - if (r) - return r; - - *pdata = msr.data; - return 0; + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - struct msr_data msr; - - msr.data = data; - msr.index = msr_index; - 
msr.host_initiated = false; - return kvm_set_msr(emul_to_vcpu(ctxt), &msr); + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) @@ -6068,6 +6199,11 @@ static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) kvm_smm_changed(emul_to_vcpu(ctxt)); } +static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) +{ + return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -6109,6 +6245,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, .post_leave_smm = emulator_post_leave_smm, + .set_xcr = emulator_set_xcr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6168,7 +6305,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) +void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; @@ -6180,37 +6317,43 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) ctxt->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); - if (ret != X86EMUL_CONTINUE) - return EMULATE_FAIL; - - ctxt->eip = ctxt->_eip; - kvm_rip_write(vcpu, ctxt->eip); - kvm_set_rflags(vcpu, ctxt->eflags); - - return EMULATE_DONE; + if (ret != X86EMUL_CONTINUE) { + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } else { + ctxt->eip = ctxt->_eip; + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); + } } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { - int r = EMULATE_DONE; - ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (emulation_type & EMULTYPE_NO_UD_ON_FAIL) - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_VMWARE_GP) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } - if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + if (emulation_type & EMULTYPE_SKIP) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_USER_EXIT; + return 0; } kvm_queue_exception(vcpu, UD_VECTOR); - return r; + if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + return 1; } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, @@ -6365,7 +6508,7 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; @@ -6374,18 +6517,20 @@ static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return 0; } + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return 1; } int kvm_skip_emulated_instruction(struct kvm_vcpu 
*vcpu) { unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - int r = EMULATE_DONE; + int r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_x86_ops->skip_emulated_instruction(vcpu); + if (unlikely(!r)) + return 0; /* * rflags is the old, "raw" value of the flags. The new value has @@ -6396,8 +6541,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) * that sets the TF flag". */ if (unlikely(rflags & X86_EFLAGS_TF)) - kvm_vcpu_do_singlestep(vcpu, &r); - return r == EMULATE_DONE; + r = kvm_vcpu_do_singlestep(vcpu); + return r; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); @@ -6416,7 +6561,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; + *r = 0; return true; } } @@ -6432,7 +6577,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.dr6 &= ~DR_TRAP_BITS; vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); - *r = EMULATE_DONE; + *r = 1; return true; } } @@ -6516,32 +6661,48 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; if (r != EMULATION_OK) { - if (emulation_type & EMULTYPE_TRAP_UD) - return EMULATE_FAIL; + if ((emulation_type & EMULTYPE_TRAP_UD) || + (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; - if (ctxt->have_exception && inject_emulated_exception(vcpu)) - return EMULATE_DONE; - if (emulation_type & EMULTYPE_SKIP) - return EMULATE_FAIL; + return 1; + if (ctxt->have_exception) { + /* + * #UD should result in just EMULATION_FAILED, and trap-like + * exception should not be encountered during decode. + */ + WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || + exception_type(ctxt->exception.vector) == EXCPT_TRAP); + inject_emulated_exception(vcpu); + return 1; + } return handle_emulation_failure(vcpu, emulation_type); } } - if ((emulation_type & EMULTYPE_VMWARE) && - !is_vmware_backdoor_opcode(ctxt)) - return EMULATE_FAIL; + if ((emulation_type & EMULTYPE_VMWARE_GP) && + !is_vmware_backdoor_opcode(ctxt)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + /* + * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks + * for kvm_skip_emulated_instruction(). The caller is responsible for + * updating interruptibility state and injecting single-step #DBs. 
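The warning in the comment above is the crux of the EMULTYPE_SKIP contract. The contrast with the full-service helper can be sketched roughly as follows; skip_after_intercept() is an invented wrapper for illustration only:

static int skip_after_intercept(struct kvm_vcpu *vcpu, bool vendor_path)
{
	if (!vendor_path)
		/*
		 * Full-service: advances RIP via the vendor callback and
		 * injects the single-step #DB if RFLAGS.TF is set. The
		 * safe default for exit handlers.
		 */
		return kvm_skip_emulated_instruction(vcpu);

	/*
	 * Decode-and-skip only: RIP moves (and RF is cleared), but
	 * interruptibility state and single-step #DB bookkeeping are
	 * left entirely to the vendor caller.
	 */
	return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP);
}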
+ */ if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); - return EMULATE_DONE; + return 1; } if (retry_instruction(ctxt, cr2, emulation_type)) - return EMULATE_DONE; + return 1; /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ @@ -6557,18 +6718,18 @@ restart: r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) - return EMULATE_DONE; + return 1; if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; + return 1; return handle_emulation_failure(vcpu, emulation_type); } if (ctxt->have_exception) { - r = EMULATE_DONE; + r = 1; if (inject_emulated_exception(vcpu)) return r; } else if (vcpu->arch.pio.count) { @@ -6579,16 +6740,18 @@ restart: writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_USER_EXIT; + r = 0; } else if (vcpu->mmio_needed) { + ++vcpu->stat.mmio_exits; + if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_USER_EXIT; + r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else - r = EMULATE_DONE; + r = 1; if (writeback) { unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -6597,8 +6760,8 @@ restart: if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && ctxt->tf) - kvm_vcpu_do_singlestep(vcpu, &r); + if (r && ctxt->tf) + r = kvm_vcpu_do_singlestep(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -7803,8 +7966,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) - kvm_x86_ops->get_vmcs12_pages(vcpu); + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) { + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -8192,12 +8359,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu) static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; + return r; } static int complete_emulated_pio(struct kvm_vcpu *vcpu) @@ -8565,23 +8731,22 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - - if (ret) - return EMULATE_FAIL; + if (ret) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); - return EMULATE_DONE; + return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8600,7 +8765,7 @@ static int kvm_valid_sregs(struct kvm_vcpu 
*vcpu, struct kvm_sregs *sregs) return -EINVAL; } - return 0; + return kvm_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -9290,6 +9455,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9315,10 +9482,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - if (kvm_x86_ops->vm_init) - return kvm_x86_ops->vm_init(kvm); + return kvm_x86_ops->vm_init(kvm); +} - return 0; +int kvm_arch_post_init_vm(struct kvm *kvm) +{ + return kvm_mmu_post_init_vm(kvm); } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -9422,6 +9591,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(x86_set_memory_region); +void kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ + kvm_mmu_pre_destroy_vm(kvm); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { @@ -9622,8 +9796,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * Scan sptes if dirty logging has been stopped, dropping those * which can be collapsed into a single large-page spte. Later * page faults will create the large-page sptes. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() */ - if ((change != KVM_MR_DELETE) && + if (change == KVM_MR_FLAGS_ONLY && (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); @@ -10010,7 +10189,7 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); bool kvm_arch_has_irq_bypass(void) { - return kvm_x86_ops->update_pi_irte != NULL; + return true; } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, @@ -10050,9 +10229,6 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - if (!kvm_x86_ops->update_pi_irte) - return -EINVAL; - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } @@ -10079,11 +10255,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6594020c0691..dbf7442a822b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -196,7 +196,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu, * actually a nGPA. */ vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 
0 : gva & PAGE_MASK; - vcpu->arch.access = access; + vcpu->arch.mmio_access = access; vcpu->arch.mmio_gfn = gfn; vcpu->arch.mmio_gen = gen; } @@ -261,7 +261,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) } void kvm_set_pending_timer(struct kvm_vcpu *vcpu); -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); +void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 get_kvmclock_ns(struct kvm *kvm); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 4fe1601dbc5d..86976b55ae74 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -33,7 +33,7 @@ 102: .section .fixup,"ax" 103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail + jmp .Lcopy_user_handle_tail .previous _ASM_EXTABLE_UA(100b, 103b) @@ -113,7 +113,7 @@ ENTRY(copy_user_generic_unrolled) 40: leal (%rdx,%rcx,8),%edx jmp 60f 50: movl %ecx,%edx -60: jmp copy_user_handle_tail /* ecx is zerorest also */ +60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */ .previous _ASM_EXTABLE_UA(1b, 30b) @@ -177,7 +177,7 @@ ENTRY(copy_user_generic_string) .section .fixup,"ax" 11: leal (%rdx,%rcx,8),%ecx 12: movl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail + jmp .Lcopy_user_handle_tail .previous _ASM_EXTABLE_UA(1b, 11b) @@ -210,7 +210,7 @@ ENTRY(copy_user_enhanced_fast_string) .section .fixup,"ax" 12: movl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail + jmp .Lcopy_user_handle_tail .previous _ASM_EXTABLE_UA(1b, 12b) @@ -231,7 +231,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string) * eax uncopied bytes or 0 if successful. */ ALIGN; -copy_user_handle_tail: +.Lcopy_user_handle_tail: movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax @@ -239,7 +239,7 @@ copy_user_handle_tail: ret _ASM_EXTABLE_UA(1b, 2b) -END(copy_user_handle_tail) +END(.Lcopy_user_handle_tail) /* * copy_user_nocache - Uncached memory copy with exception handling @@ -364,7 +364,7 @@ ENTRY(__copy_user_nocache) movl %ecx,%edx .L_fixup_handle_tail: sfence - jmp copy_user_handle_tail + jmp .Lcopy_user_handle_tail .previous _ASM_EXTABLE_UA(1b, .L_fixup_4x8b_copy) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index b7375dc6898f..c126571e5e2e 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -113,8 +113,8 @@ static void delay_mwaitx(unsigned long __loops) __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* - * AMD, like Intel, supports the EAX hint and EAX=0xf - * means, do not enter any deep C-state and we use it + * AMD, like Intel's MWAIT version, supports the EAX hint and + * EAX=0xf0 means, do not enter any deep C-state and we use it * here in delay() to minimize wakeup latency. 
*/ __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 304f958c27b2..9578eb88fc87 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -115,7 +115,7 @@ ENDPROC(__get_user_8) EXPORT_SYMBOL(__get_user_8) -bad_get_user_clac: +.Lbad_get_user_clac: ASM_CLAC bad_get_user: xor %edx,%edx @@ -123,7 +123,7 @@ bad_get_user: ret #ifdef CONFIG_X86_32 -bad_get_user_8_clac: +.Lbad_get_user_8_clac: ASM_CLAC bad_get_user_8: xor %edx,%edx @@ -132,12 +132,12 @@ bad_get_user_8: ret #endif - _ASM_EXTABLE_UA(1b, bad_get_user_clac) - _ASM_EXTABLE_UA(2b, bad_get_user_clac) - _ASM_EXTABLE_UA(3b, bad_get_user_clac) + _ASM_EXTABLE_UA(1b, .Lbad_get_user_clac) + _ASM_EXTABLE_UA(2b, .Lbad_get_user_clac) + _ASM_EXTABLE_UA(3b, .Lbad_get_user_clac) #ifdef CONFIG_X86_64 - _ASM_EXTABLE_UA(4b, bad_get_user_clac) + _ASM_EXTABLE_UA(4b, .Lbad_get_user_clac) #else - _ASM_EXTABLE_UA(4b, bad_get_user_8_clac) - _ASM_EXTABLE_UA(5b, bad_get_user_8_clac) + _ASM_EXTABLE_UA(4b, .Lbad_get_user_8_clac) + _ASM_EXTABLE_UA(5b, .Lbad_get_user_8_clac) #endif diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 14bf78341d3c..126dd6a9ec9b 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -37,7 +37,7 @@ ENTRY(__put_user_1) ENTER cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX - jae bad_put_user + jae .Lbad_put_user ASM_STAC 1: movb %al,(%_ASM_CX) xor %eax,%eax @@ -51,7 +51,7 @@ ENTRY(__put_user_2) mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $1,%_ASM_BX cmp %_ASM_BX,%_ASM_CX - jae bad_put_user + jae .Lbad_put_user ASM_STAC 2: movw %ax,(%_ASM_CX) xor %eax,%eax @@ -65,7 +65,7 @@ ENTRY(__put_user_4) mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $3,%_ASM_BX cmp %_ASM_BX,%_ASM_CX - jae bad_put_user + jae .Lbad_put_user ASM_STAC 3: movl %eax,(%_ASM_CX) xor %eax,%eax @@ -79,7 +79,7 @@ ENTRY(__put_user_8) mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $7,%_ASM_BX cmp %_ASM_BX,%_ASM_CX - jae bad_put_user + jae .Lbad_put_user ASM_STAC 4: mov %_ASM_AX,(%_ASM_CX) #ifdef CONFIG_X86_32 @@ -91,16 +91,16 @@ ENTRY(__put_user_8) ENDPROC(__put_user_8) EXPORT_SYMBOL(__put_user_8) -bad_put_user_clac: +.Lbad_put_user_clac: ASM_CLAC -bad_put_user: +.Lbad_put_user: movl $-EFAULT,%eax RET - _ASM_EXTABLE_UA(1b, bad_put_user_clac) - _ASM_EXTABLE_UA(2b, bad_put_user_clac) - _ASM_EXTABLE_UA(3b, bad_put_user_clac) - _ASM_EXTABLE_UA(4b, bad_put_user_clac) + _ASM_EXTABLE_UA(1b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac) + _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac) #ifdef CONFIG_X86_32 - _ASM_EXTABLE_UA(5b, bad_put_user_clac) + _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac) #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 63e99f15d7cf..a39dcdb5ae34 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -19,6 +19,7 @@ #include <asm/set_memory.h> #include <asm/e820/api.h> +#include <asm/efi.h> #include <asm/fixmap.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index fece30ca8b0c..9268c12458c8 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -344,13 +344,11 @@ bool sme_active(void) { return sme_me_mask && !sev_enabled; } -EXPORT_SYMBOL(sme_active); bool sev_active(void) { return sme_me_mask && sev_enabled; } -EXPORT_SYMBOL(sev_active); /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) diff --git 
a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index fa16036fa592..65ebe4b88f7c 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -54,23 +54,10 @@ static u64 get_subtree_max_end(struct rb_node *node) return ret; } -static u64 compute_subtree_max_end(struct memtype *data) -{ - u64 max_end = data->end, child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_right); - if (child_max_end > max_end) - max_end = child_max_end; - - child_max_end = get_subtree_max_end(data->rb.rb_left); - if (child_max_end > max_end) - max_end = child_max_end; - - return max_end; -} +#define NODE_END(node) ((node)->end) -RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb, - u64, subtree_max_end, compute_subtree_max_end) +RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb, + struct memtype, rb, u64, subtree_max_end, NODE_END) /* Find the first (lowest start addr) overlapping range from rb tree */ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 44816ff6411f..3e4b9035bb9a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -45,7 +45,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_page_dtor(pte); + pgtable_pte_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } @@ -357,7 +357,7 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -void __init pgd_cache_init(void) +void __init pgtable_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use @@ -402,10 +402,6 @@ static inline void _pgd_free(pgd_t *pgd) } #else -void __init pgd_cache_init(void) -{ -} - static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index b196524759ec..7f2140414440 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -330,13 +330,15 @@ pti_clone_pgtable(unsigned long start, unsigned long end, pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - addr += PUD_SIZE; + WARN_ON_ONCE(addr & ~PUD_MASK); + addr = round_up(addr + 1, PUD_SIZE); continue; } pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - addr += PMD_SIZE; + WARN_ON_ONCE(addr & ~PMD_MASK); + addr = round_up(addr + 1, PMD_SIZE); continue; } @@ -666,6 +668,8 @@ void __init pti_init(void) */ void pti_finalize(void) { + if (!boot_cpu_has(X86_FEATURE_PTI)) + return; /* * We need to clone everything (again) that maps parts of the * kernel image. diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 0881e1ff1e58..a8bd952e136d 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/io.h> #include <linux/mmiotrace.h> +#include <linux/security.h> static unsigned long mmio_address; module_param_hw(mmio_address, ulong, iomem, 0); @@ -115,6 +116,10 @@ static void do_test_bulk_ioremapping(void) static int __init init(void) { unsigned long size = (read_far) ? 
(8 << 20) : (16 << 10); + int ret = security_locked_down(LOCKDOWN_MMIOTRACE); + + if (ret) + return ret; if (mmio_address == 0) { pr_err("you have to use the module argument mmio_address.\n"); diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 7389db538c30..6fa42e9c4e6f 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -29,6 +29,7 @@ static bool pci_mmcfg_running_state; static bool pci_mmcfg_arch_init_failed; static DEFINE_MUTEX(pci_mmcfg_lock); +#define pci_mmcfg_lock_held() lock_is_held(&(pci_mmcfg_lock).dep_map) LIST_HEAD(pci_mmcfg_list); @@ -54,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new) struct pci_mmcfg_region *cfg; /* keep list sorted by segment and starting bus number */ - list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) { + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) { if (cfg->segment > new->segment || (cfg->segment == new->segment && cfg->start_bus >= new->start_bus)) { @@ -118,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus) { struct pci_mmcfg_region *cfg; - list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) + list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) if (cfg->segment == segment && cfg->start_bus <= bus && bus <= cfg->end_bus) return cfg; diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 97bbc12dd6b2..6269a175385d 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * arch/x86/pci/sta2x11-fixup.c - * glue code for lib/swiotlb.c and DMA translation between STA2x11 - * AMBA memory mapping and the X86 memory mapping + * DMA translation between STA2x11 AMBA memory mapping and the x86 memory mapping * * ST Microelectronics ConneXt (STA2X11/STA2X10) * diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a7189a3b4d70..425e025341db 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -59,11 +59,34 @@ static efi_system_table_t efi_systab __initdata; static efi_config_table_type_t arch_tables[] __initdata = { #ifdef CONFIG_X86_UV - {UV_SYSTEM_TABLE_GUID, "UVsystab", &efi.uv_systab}, + {UV_SYSTEM_TABLE_GUID, "UVsystab", &uv_systab_phys}, #endif {NULL_GUID, NULL, NULL}, }; +static const unsigned long * const efi_tables[] = { + &efi.mps, + &efi.acpi, + &efi.acpi20, + &efi.smbios, + &efi.smbios3, + &efi.boot_info, + &efi.hcdp, + &efi.uga, +#ifdef CONFIG_X86_UV + &uv_systab_phys, +#endif + &efi.fw_vendor, + &efi.runtime, + &efi.config_table, + &efi.esrt, + &efi.properties_table, + &efi.mem_attr_table, +#ifdef CONFIG_EFI_RCI2_TABLE + &rci2_table_phys, +#endif +}; + u64 efi_setup; /* efi setup_data physical address */ static int add_efi_memmap __initdata; @@ -894,9 +917,6 @@ static void __init kexec_enter_virtual_mode(void) if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX)) runtime_code_page_mkexec(); - - /* clean DUMMY object */ - efi_delete_dummy_variable(); #endif } @@ -1049,3 +1069,17 @@ static int __init arch_parse_efi_cmdline(char *str) return 0; } early_param("efi", arch_parse_efi_cmdline); + +bool efi_is_table_address(unsigned long phys_addr) +{ + unsigned int i; + + if (phys_addr == EFI_INVALID_TABLE_ADDR) + return false; + + for (i = 0; i < ARRAY_SIZE(efi_tables); i++) + if (*(efi_tables[i]) == phys_addr) + return true; + + return false; +} diff --git a/arch/x86/platform/intel/iosf_mbi.c 
b/arch/x86/platform/intel/iosf_mbi.c index 2e796b54cbde..9e2444500428 100644 --- a/arch/x86/platform/intel/iosf_mbi.c +++ b/arch/x86/platform/intel/iosf_mbi.c @@ -17,6 +17,7 @@ #include <linux/debugfs.h> #include <linux/capability.h> #include <linux/pm_qos.h> +#include <linux/wait.h> #include <asm/iosf_mbi.h> @@ -201,23 +202,45 @@ EXPORT_SYMBOL(iosf_mbi_available); #define PUNIT_SEMAPHORE_BIT BIT(0) #define PUNIT_SEMAPHORE_ACQUIRE BIT(1) -static DEFINE_MUTEX(iosf_mbi_punit_mutex); -static DEFINE_MUTEX(iosf_mbi_block_punit_i2c_access_count_mutex); +static DEFINE_MUTEX(iosf_mbi_pmic_access_mutex); static BLOCKING_NOTIFIER_HEAD(iosf_mbi_pmic_bus_access_notifier); -static u32 iosf_mbi_block_punit_i2c_access_count; +static DECLARE_WAIT_QUEUE_HEAD(iosf_mbi_pmic_access_waitq); +static u32 iosf_mbi_pmic_punit_access_count; +static u32 iosf_mbi_pmic_i2c_access_count; static u32 iosf_mbi_sem_address; static unsigned long iosf_mbi_sem_acquired; static struct pm_qos_request iosf_mbi_pm_qos; void iosf_mbi_punit_acquire(void) { - mutex_lock(&iosf_mbi_punit_mutex); + /* Wait for any I2C PMIC accesses from in-kernel drivers to finish. */ + mutex_lock(&iosf_mbi_pmic_access_mutex); + while (iosf_mbi_pmic_i2c_access_count != 0) { + mutex_unlock(&iosf_mbi_pmic_access_mutex); + wait_event(iosf_mbi_pmic_access_waitq, + iosf_mbi_pmic_i2c_access_count == 0); + mutex_lock(&iosf_mbi_pmic_access_mutex); + } + /* + * We do not need to do anything to allow the PUNIT to safely access + * the PMIC, other than blocking in-kernel accesses to the PMIC. + */ + iosf_mbi_pmic_punit_access_count++; + mutex_unlock(&iosf_mbi_pmic_access_mutex); } EXPORT_SYMBOL(iosf_mbi_punit_acquire); void iosf_mbi_punit_release(void) { - mutex_unlock(&iosf_mbi_punit_mutex); + bool do_wakeup; + + mutex_lock(&iosf_mbi_pmic_access_mutex); + iosf_mbi_pmic_punit_access_count--; + do_wakeup = iosf_mbi_pmic_punit_access_count == 0; + mutex_unlock(&iosf_mbi_pmic_access_mutex); + + if (do_wakeup) + wake_up(&iosf_mbi_pmic_access_waitq); } EXPORT_SYMBOL(iosf_mbi_punit_release); @@ -256,34 +279,32 @@ static void iosf_mbi_reset_semaphore(void) * already blocked P-Unit accesses because it wants them blocked over multiple * i2c-transfers, for e.g. read-modify-write of an I2C client register. * - * The P-Unit accesses already being blocked is tracked through the - * iosf_mbi_block_punit_i2c_access_count variable which is protected by the - * iosf_mbi_block_punit_i2c_access_count_mutex this mutex is hold for the - * entire duration of the function. - * - * If access is not blocked yet, this function takes the following steps: + * To allow safe PMIC i2c bus accesses, this function takes the following steps: * * 1) Some code sends request to the P-Unit which make it access the PMIC * I2C bus. Testing has shown that the P-Unit does not check its internal * PMIC bus semaphore for these requests. Callers of these requests call * iosf_mbi_punit_acquire()/_release() around their P-Unit accesses, these - * functions lock/unlock the iosf_mbi_punit_mutex. - * As the first step we lock the iosf_mbi_punit_mutex, to wait for any in - * flight requests to finish and to block any new requests. + * functions increase/decrease iosf_mbi_pmic_punit_access_count, so first + * we wait for iosf_mbi_pmic_punit_access_count to become 0. + + * 2) Check iosf_mbi_pmic_i2c_access_count; if access has already + * been blocked by another caller, we only need to increment + * iosf_mbi_pmic_i2c_access_count and we can skip the other steps.
* - * 2) Some code makes such P-Unit requests from atomic contexts where it + * 3) Some code makes such P-Unit requests from atomic contexts where it * cannot call iosf_mbi_punit_acquire() as that may sleep. * As the second step we call a notifier chain which allows any code * needing P-Unit resources from atomic context to acquire them before * we take control over the PMIC I2C bus. * - * 3) When CPU cores enter C6 or C7 the P-Unit needs to talk to the PMIC + * 4) When CPU cores enter C6 or C7 the P-Unit needs to talk to the PMIC * if this happens while the kernel itself is accessing the PMIC I2C bus * the SoC hangs. * As the third step we call pm_qos_update_request() to disallow the CPU * to enter C6 or C7. * - * 4) The P-Unit has a PMIC bus semaphore which we can request to stop + * 5) The P-Unit has a PMIC bus semaphore which we can request to stop * autonomous P-Unit tasks from accessing the PMIC I2C bus while we hold it. * As the fourth and final step we request this semaphore and wait for our * request to be acknowledged. @@ -297,12 +318,18 @@ int iosf_mbi_block_punit_i2c_access(void) if (WARN_ON(!mbi_pdev || !iosf_mbi_sem_address)) return -ENXIO; - mutex_lock(&iosf_mbi_block_punit_i2c_access_count_mutex); + mutex_lock(&iosf_mbi_pmic_access_mutex); - if (iosf_mbi_block_punit_i2c_access_count > 0) + while (iosf_mbi_pmic_punit_access_count != 0) { + mutex_unlock(&iosf_mbi_pmic_access_mutex); + wait_event(iosf_mbi_pmic_access_waitq, + iosf_mbi_pmic_punit_access_count == 0); + mutex_lock(&iosf_mbi_pmic_access_mutex); + } + + if (iosf_mbi_pmic_i2c_access_count > 0) goto success; - mutex_lock(&iosf_mbi_punit_mutex); blocking_notifier_call_chain(&iosf_mbi_pmic_bus_access_notifier, MBI_PMIC_BUS_ACCESS_BEGIN, NULL); @@ -330,10 +357,6 @@ int iosf_mbi_block_punit_i2c_access(void) iosf_mbi_sem_acquired = jiffies; dev_dbg(&mbi_pdev->dev, "P-Unit semaphore acquired after %ums\n", jiffies_to_msecs(jiffies - start)); - /* - * Success, keep iosf_mbi_punit_mutex locked till - * iosf_mbi_unblock_punit_i2c_access() gets called. 
- */ goto success; } @@ -344,15 +367,13 @@ int iosf_mbi_block_punit_i2c_access(void) dev_err(&mbi_pdev->dev, "Error P-Unit semaphore timed out, resetting\n"); error: iosf_mbi_reset_semaphore(); - mutex_unlock(&iosf_mbi_punit_mutex); - if (!iosf_mbi_get_sem(&sem)) dev_err(&mbi_pdev->dev, "P-Unit semaphore: %d\n", sem); success: if (!WARN_ON(ret)) - iosf_mbi_block_punit_i2c_access_count++; + iosf_mbi_pmic_i2c_access_count++; - mutex_unlock(&iosf_mbi_block_punit_i2c_access_count_mutex); + mutex_unlock(&iosf_mbi_pmic_access_mutex); return ret; } @@ -360,17 +381,20 @@ EXPORT_SYMBOL(iosf_mbi_block_punit_i2c_access); void iosf_mbi_unblock_punit_i2c_access(void) { - mutex_lock(&iosf_mbi_block_punit_i2c_access_count_mutex); + bool do_wakeup = false; - iosf_mbi_block_punit_i2c_access_count--; - if (iosf_mbi_block_punit_i2c_access_count == 0) { + mutex_lock(&iosf_mbi_pmic_access_mutex); + iosf_mbi_pmic_i2c_access_count--; + if (iosf_mbi_pmic_i2c_access_count == 0) { iosf_mbi_reset_semaphore(); - mutex_unlock(&iosf_mbi_punit_mutex); dev_dbg(&mbi_pdev->dev, "punit semaphore held for %ums\n", jiffies_to_msecs(jiffies - iosf_mbi_sem_acquired)); + do_wakeup = true; } + mutex_unlock(&iosf_mbi_pmic_access_mutex); - mutex_unlock(&iosf_mbi_block_punit_i2c_access_count_mutex); + if (do_wakeup) + wake_up(&iosf_mbi_pmic_access_waitq); } EXPORT_SYMBOL(iosf_mbi_unblock_punit_i2c_access); @@ -379,10 +403,10 @@ int iosf_mbi_register_pmic_bus_access_notifier(struct notifier_block *nb) int ret; /* Wait for the bus to go inactive before registering */ - mutex_lock(&iosf_mbi_punit_mutex); + iosf_mbi_punit_acquire(); ret = blocking_notifier_chain_register( &iosf_mbi_pmic_bus_access_notifier, nb); - mutex_unlock(&iosf_mbi_punit_mutex); + iosf_mbi_punit_release(); return ret; } @@ -403,9 +427,9 @@ int iosf_mbi_unregister_pmic_bus_access_notifier(struct notifier_block *nb) int ret; /* Wait for the bus to go inactive before unregistering */ - mutex_lock(&iosf_mbi_punit_mutex); + iosf_mbi_punit_acquire(); ret = iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(nb); - mutex_unlock(&iosf_mbi_punit_mutex); + iosf_mbi_punit_release(); return ret; } @@ -413,7 +437,7 @@ EXPORT_SYMBOL(iosf_mbi_unregister_pmic_bus_access_notifier); void iosf_mbi_assert_punit_acquired(void) { - WARN_ON(!mutex_is_locked(&iosf_mbi_punit_mutex)); + WARN_ON(iosf_mbi_pmic_punit_access_count == 0); } EXPORT_SYMBOL(iosf_mbi_assert_punit_acquired); diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 25ce1b3b0732..99a28ce2244c 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -157,6 +157,12 @@ static ssize_t lid_wake_mode_set(struct device *dev, static DEVICE_ATTR(lid_wake_mode, S_IWUSR | S_IRUGO, lid_wake_mode_show, lid_wake_mode_set); +static struct attribute *lid_attrs[] = { + &dev_attr_lid_wake_mode.attr, + NULL, +}; +ATTRIBUTE_GROUPS(lid); + /* * Process all items in the EC's SCI queue. 
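
The iosf_mbi hunks above replace the long-held iosf_mbi_punit_mutex with one short-held mutex, two access counters, and a wait queue: P-Unit-mediated PMIC users and in-kernel I2C PMIC users each bump their own counter and sleep until the opposing counter drains to zero, so neither side holds a mutex across a whole PMIC transaction. Below is a minimal user-space sketch of that two-class exclusion pattern, using pthreads; all names are illustrative and nothing in it is taken from the kernel sources.

/* Two-class exclusion: each class may run concurrently with itself,
 * never with the other class. Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t access_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  access_cond  = PTHREAD_COND_INITIALIZER;
static unsigned int punit_count, i2c_count;

static void punit_acquire(void)
{
	pthread_mutex_lock(&access_mutex);
	while (i2c_count != 0)	/* wait for the other class to drain */
		pthread_cond_wait(&access_cond, &access_mutex);
	punit_count++;		/* join our own class */
	pthread_mutex_unlock(&access_mutex);
}

static void punit_release(void)
{
	pthread_mutex_lock(&access_mutex);
	if (--punit_count == 0)	/* last one out wakes the other class */
		pthread_cond_broadcast(&access_cond);
	pthread_mutex_unlock(&access_mutex);
}

/* The I2C side is the mirror image, waiting on punit_count instead. */
static void i2c_acquire(void)
{
	pthread_mutex_lock(&access_mutex);
	while (punit_count != 0)
		pthread_cond_wait(&access_cond, &access_mutex);
	i2c_count++;
	pthread_mutex_unlock(&access_mutex);
}

static void i2c_release(void)
{
	pthread_mutex_lock(&access_mutex);
	if (--i2c_count == 0)
		pthread_cond_broadcast(&access_cond);
	pthread_mutex_unlock(&access_mutex);
}

int main(void)
{
	punit_acquire();	/* would block here if an i2c user were active */
	punit_release();
	i2c_acquire();
	i2c_release();
	puts("two-class exclusion ok");
	return 0;
}

The kernel has no condition variables, so the diff open-codes the same drop-the-lock-and-sleep step with wait_event() on iosf_mbi_pmic_access_waitq plus a wake_up() from the last releaser of each class.
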
* @@ -510,17 +516,8 @@ static int setup_lid_switch(struct platform_device *pdev) goto err_register; } - r = device_create_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode); - if (r) { - dev_err(&pdev->dev, "failed to create wake mode attr: %d\n", r); - goto err_create_attr; - } - return 0; -err_create_attr: - input_unregister_device(lid_switch_idev); - lid_switch_idev = NULL; err_register: input_free_device(lid_switch_idev); return r; @@ -528,7 +525,6 @@ err_register: static void free_lid_switch(void) { - device_remove_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode); input_unregister_device(lid_switch_idev); } @@ -624,6 +620,7 @@ static int xo1_sci_remove(struct platform_device *pdev) static struct platform_driver xo1_sci_driver = { .driver = { .name = "olpc-xo1-sci-acpi", + .dev_groups = lid_groups, }, .probe = xo1_sci_probe, .remove = xo1_sci_remove, diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 7c69652ffeea..c2ee31953372 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -14,6 +14,8 @@ #include <asm/uv/bios.h> #include <asm/uv/uv_hub.h> +unsigned long uv_systab_phys __ro_after_init = EFI_INVALID_TABLE_ADDR; + struct uv_systab *uv_systab; static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, @@ -185,13 +187,13 @@ EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); void uv_bios_init(void) { uv_systab = NULL; - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || - !efi.uv_systab || efi_runtime_disabled()) { + if ((uv_systab_phys == EFI_INVALID_TABLE_ADDR) || + !uv_systab_phys || efi_runtime_disabled()) { pr_crit("UV: UVsystab: missing\n"); return; } - uv_systab = ioremap(efi.uv_systab, sizeof(struct uv_systab)); + uv_systab = ioremap(uv_systab_phys, sizeof(struct uv_systab)); if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) { pr_err("UV: UVsystab: bad signature!\n"); iounmap(uv_systab); @@ -203,7 +205,7 @@ void uv_bios_init(void) int size = uv_systab->size; iounmap(uv_systab); - uv_systab = ioremap(efi.uv_systab, size); + uv_systab = ioremap(uv_systab_phys, size); if (!uv_systab) { pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); return; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 20c389a91b80..5f0a96bf27a1 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1804,9 +1804,9 @@ static void pq_init(int node, int pnode) plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry); vp = kmalloc_node(plsize, GFP_KERNEL, node); - pqp = (struct bau_pq_entry *)vp; - BUG_ON(!pqp); + BUG_ON(!vp); + pqp = (struct bau_pq_entry *)vp; cp = (char *)pqp + 31; pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index c9ef6a7a4a1a..915bb1639763 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -123,9 +123,6 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr2 = read_cr2(); ctxt->cr3 = __read_cr3(); ctxt->cr4 = __read_cr4(); -#ifdef CONFIG_X86_64 - ctxt->cr8 = read_cr8(); -#endif ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, &ctxt->misc_enable); msr_save_context(ctxt); @@ -208,7 +205,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) #else /* CONFIG X86_64 */ wrmsrl(MSR_EFER, ctxt->efer); - write_cr8(ctxt->cr8); __write_cr4(ctxt->cr4); #endif write_cr3(ctxt->cr3); diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 8901a1f89cf5..fb4ee5444379 100644 --- 
a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -9,46 +9,52 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/string.o: $(srctree)/arch/x86/boot/compressed/string.c FORCE $(call if_changed_rule,cc_o_c) -$(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE +$(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) +CFLAGS_sha256.o := -D__DISABLE_EXPORTS + LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro KASAN_SANITIZE := n KCOV_INSTRUMENT := n +# These are adjustments to the compiler flags used for objects that +# make up the standalone purgatory.ro + +PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel +PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss +PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) + # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That # in turn leaves some undefined symbols like __fentry__ in purgatory and not # sure how to relocate those. ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_sha256.o += $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_purgatory.o += $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_string.o += $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_kexec-purgatory.o += $(CC_FLAGS_FTRACE) +PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_FTRACE) endif ifdef CONFIG_STACKPROTECTOR -CFLAGS_REMOVE_sha256.o += -fstack-protector -CFLAGS_REMOVE_purgatory.o += -fstack-protector -CFLAGS_REMOVE_string.o += -fstack-protector -CFLAGS_REMOVE_kexec-purgatory.o += -fstack-protector +PURGATORY_CFLAGS_REMOVE += -fstack-protector endif ifdef CONFIG_STACKPROTECTOR_STRONG -CFLAGS_REMOVE_sha256.o += -fstack-protector-strong -CFLAGS_REMOVE_purgatory.o += -fstack-protector-strong -CFLAGS_REMOVE_string.o += -fstack-protector-strong -CFLAGS_REMOVE_kexec-purgatory.o += -fstack-protector-strong +PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong endif ifdef CONFIG_RETPOLINE -CFLAGS_REMOVE_sha256.o += $(RETPOLINE_CFLAGS) -CFLAGS_REMOVE_purgatory.o += $(RETPOLINE_CFLAGS) -CFLAGS_REMOVE_string.o += $(RETPOLINE_CFLAGS) -CFLAGS_REMOVE_kexec-purgatory.o += $(RETPOLINE_CFLAGS) +PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS) endif +CFLAGS_REMOVE_purgatory.o += $(PURGATORY_CFLAGS_REMOVE) +CFLAGS_purgatory.o += $(PURGATORY_CFLAGS) + +CFLAGS_REMOVE_sha256.o += $(PURGATORY_CFLAGS_REMOVE) +CFLAGS_sha256.o += $(PURGATORY_CFLAGS) + +CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE) +CFLAGS_string.o += $(PURGATORY_CFLAGS) + $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index b607bda786f6..3b95410ff0f8 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -9,7 +9,7 @@ */ #include <linux/bug.h> -#include <linux/sha256.h> +#include <crypto/sha.h> #include <asm/purgatory.h> #include "../boot/string.h" diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 30b0d30d861a..6363761cc74c 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -19,7 +19,6 @@ GLOBAL(real_mode_header) .long pa_ro_end /* SMP trampoline */ .long pa_trampoline_start - .long pa_trampoline_status .long pa_trampoline_header #ifdef CONFIG_X86_64 .long pa_trampoline_pgd; diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 2dd866c9e21e..1868b158480d 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -41,9 +41,6 @@ ENTRY(trampoline_start) movl tr_start, %eax # where 
we need to go - movl $0xA5A5A5A5, trampoline_status - # write marker for master knows we're running - /* * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 24bb7598774e..aee2b45d83b8 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -49,9 +49,6 @@ ENTRY(trampoline_start) mov %ax, %es mov %ax, %ss - movl $0xA5A5A5A5, trampoline_status - # write marker for master knows we're running - # Setup stack movl $rm_stack_end, %esp diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index 7c706772ab59..8d8208dcca24 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -2,7 +2,3 @@ .section ".rodata","a" .balign 16 tr_idt: .fill 1, 6, 0 - - .bss - .balign 4 -GLOBAL(trampoline_status) .space 4 diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index f31e5d903161..165be7f9a964 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -2,14 +2,7 @@ #ifndef _ASM_UM_BARRIER_H_ #define _ASM_UM_BARRIER_H_ -#include <asm/asm.h> -#include <asm/segment.h> -#include <asm/cpufeatures.h> -#include <asm/cmpxchg.h> -#include <asm/nops.h> - -#include <linux/kernel.h> -#include <linux/irqflags.h> +#include <asm/alternative.h> /* * Force strict CPU ordering. @@ -30,9 +23,6 @@ #endif /* CONFIG_X86_32 */ -#define dma_rmb() barrier() -#define dma_wmb() barrier() - #include <asm-generic/barrier.h> #endif diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c index ac9c02b9d92c..891868756a51 100644 --- a/arch/x86/um/vdso/um_vdso.c +++ b/arch/x86/um/vdso/um_vdso.c @@ -47,7 +47,7 @@ time_t __vdso_time(time_t *t) return secs; } -int time(time_t *t) __attribute__((weak, alias("__vdso_time"))); +time_t time(time_t *t) __attribute__((weak, alias("__vdso_time"))); long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 0d3365cb64de..a04551ee5568 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -57,19 +57,7 @@ static efi_system_table_t __init *xen_efi_probe(void) return NULL; /* Here we know that Xen runs on EFI platform. 
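
The um_vdso hunk above is a pure type fix: with GCC's alias attribute, callers bind to the prototype on the alias declaration itself, so declaring the vDSO's time() alias as returning int would truncate the 64-bit time_t that __vdso_time() actually returns on x86-64. A hedged, standalone illustration of the idiom follows; it is not the kernel's vdso build, and the body of __vdso_time() is faked for the example.

#include <stdio.h>
#include <time.h>

time_t __vdso_time(time_t *t)
{
	time_t secs = 1700000000;	/* stand-in for the real vdso read */
	if (t)
		*t = secs;
	return secs;
}

/* Correct: the alias declaration carries the same prototype as its
 * target. Declaring this as "int time(time_t *)" would make callers
 * truncate the 64-bit return value. */
time_t time(time_t *t) __attribute__((weak, alias("__vdso_time")));

int main(void)
{
	printf("%lld\n", (long long)time(NULL));
	return 0;
}
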
*/ - - efi.get_time = xen_efi_get_time; - efi.set_time = xen_efi_set_time; - efi.get_wakeup_time = xen_efi_get_wakeup_time; - efi.set_wakeup_time = xen_efi_set_wakeup_time; - efi.get_variable = xen_efi_get_variable; - efi.get_next_variable = xen_efi_get_next_variable; - efi.set_variable = xen_efi_set_variable; - efi.query_variable_info = xen_efi_query_variable_info; - efi.update_capsule = xen_efi_update_capsule; - efi.query_capsule_caps = xen_efi_query_capsule_caps; - efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; - efi.reset_system = xen_efi_reset_system; + xen_efi_runtime_setup(); efi_systab_xen.tables = info->cfg.addr; efi_systab_xen.nr_tables = info->cfg.nent; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 750f46ad018a..205b1176084f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -269,19 +269,41 @@ void xen_reboot(int reason) BUG(); } +static int reboot_reason = SHUTDOWN_reboot; +static bool xen_legacy_crash; void xen_emergency_restart(void) { - xen_reboot(SHUTDOWN_reboot); + xen_reboot(reboot_reason); } static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { - if (!kexec_crash_loaded()) - xen_reboot(SHUTDOWN_crash); + if (!kexec_crash_loaded()) { + if (xen_legacy_crash) + xen_reboot(SHUTDOWN_crash); + + reboot_reason = SHUTDOWN_crash; + + /* + * If panic_timeout==0 then we are supposed to wait forever. + * However, to preserve original dom0 behavior we have to drop + * into hypervisor. (domU behavior is controlled by its + * config file) + */ + if (panic_timeout == 0) + panic_timeout = -1; + } return NOTIFY_DONE; } +static int __init parse_xen_legacy_crash(char *arg) +{ + xen_legacy_crash = true; + return 0; +} +early_param("xen_legacy_crash", parse_xen_legacy_crash); + static struct notifier_block xen_panic_block = { .notifier_call = xen_panic_event, .priority = INT_MIN diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 7ceb32821093..5bfea374a160 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -117,6 +117,14 @@ static void __init xen_banner(void) printk(KERN_INFO "Xen version: %d.%d%s%s\n", version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); + +#ifdef CONFIG_X86_32 + pr_warn("WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n" + "Support for running as 32-bit PV-guest under Xen will soon be removed\n" + "from the Linux kernel!\n" + "Please use either a 64-bit kernel or switch to HVM or PVH mode!\n" + "WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! 
WARNING!\n"); +#endif } static void __init xen_pv_init_platform(void) @@ -877,16 +885,6 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } -#ifdef CONFIG_X86_64 -static inline unsigned long xen_read_cr8(void) -{ - return 0; -} -static inline void xen_write_cr8(unsigned long val) -{ - BUG_ON(val); -} -#endif static u64 xen_read_msr_safe(unsigned int msr, int *err) { @@ -1023,11 +1021,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .write_cr4 = xen_write_cr4, -#ifdef CONFIG_X86_64 - .read_cr8 = xen_read_cr8, - .write_cr8 = xen_write_cr8, -#endif - .wbinvd = native_wbinvd, .read_msr = xen_read_msr, diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 26e8b326966d..c8dbee62ec2a 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2625,7 +2625,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, *dma_handle = virt_to_machine(vstart).maddr; return success ? 0 : -ENOMEM; } -EXPORT_SYMBOL_GPL(xen_create_contiguous_region); void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) { @@ -2660,7 +2659,6 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_unlock_irqrestore(&xen_reservation_lock, flags); } -EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); static noinline void xen_flush_tlb_all(void) { |
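
The enlighten.c change above stops calling xen_reboot(SHUTDOWN_crash) directly from the panic notifier unless xen_legacy_crash is given; instead it records the crash reason for xen_emergency_restart() and forces a panic_timeout so dom0 still drops back into the hypervisor after the panic output. The handler itself is the stock panic-notifier pattern; a minimal out-of-tree sketch of that pattern, with hypothetical demo_* names, is shown below (on kernels of this vintage panic_notifier_list is declared in <linux/kernel.h>).

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int demo_panic_event(struct notifier_block *nb,
			    unsigned long event, void *ptr)
{
	/* Runs on the panic path: must not sleep or take blocking locks.
	 * ptr is the panic message string. */
	pr_emerg("demo: panic notifier fired: %s\n", (char *)ptr);
	return NOTIFY_DONE;	/* let any remaining notifiers run */
}

static struct notifier_block demo_panic_block = {
	.notifier_call = demo_panic_event,
	.priority = INT_MIN,	/* run last, as the Xen handler does */
};

static int __init demo_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list,
				       &demo_panic_block);
	return 0;
}

static void __exit demo_exit(void)
{
	atomic_notifier_chain_unregister(&panic_notifier_list,
					 &demo_panic_block);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
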