From bdd27cad22379a665fe246a0f9b37acb502df0fe Mon Sep 17 00:00:00 2001 From: Clinton Taylor Date: Thu, 6 May 2021 19:19:23 +0300 Subject: drm/i915/adl_p: ADL_P device info enabling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ADL-P to the device_info table and support MACROS. Bspec: 49185, 55372, 55373 Cc: Matt Atwood Cc: Matt Roper Signed-off-by: Clinton Taylor Signed-off-by: Matt Roper Reviewed-by: José Roberto de Souza Signed-off-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20210506161930.309688-4-imre.deak@intel.com --- arch/x86/kernel/early-quirks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6edd1e2ee8af..b553ffe9b985 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -552,6 +552,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_TGL_12_IDS(&gen11_early_ops), INTEL_RKL_IDS(&gen11_early_ops), INTEL_ADLS_IDS(&gen11_early_ops), + INTEL_ADLP_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); -- cgit v1.2.3 From e759959fe3b8313c81d6200be44cb8a644d845ea Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:34 -0500 Subject: x86/sev-es: Rename sev-es.{ch} to sev.{ch} SEV-SNP builds upon the SEV-ES functionality while adding new hardware protection. Version 2 of the GHCB specification adds new NAE events that are SEV-SNP specific. Rename the sev-es.{ch} to sev.{ch} so that all SEV* functionality can be consolidated in one place. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-2-brijesh.singh@amd.com --- arch/x86/boot/compressed/Makefile | 6 +- arch/x86/boot/compressed/sev-es.c | 206 ------ arch/x86/boot/compressed/sev.c | 206 ++++++ arch/x86/include/asm/sev-es.h | 114 --- arch/x86/include/asm/sev.h | 114 +++ arch/x86/kernel/Makefile | 6 +- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/sev-es-shared.c | 525 ------------- arch/x86/kernel/sev-es.c | 1461 ------------------------------------- arch/x86/kernel/sev-shared.c | 525 +++++++++++++ arch/x86/kernel/sev.c | 1461 +++++++++++++++++++++++++++++++++++++ arch/x86/mm/extable.c | 2 +- arch/x86/platform/efi/efi_64.c | 2 +- arch/x86/realmode/init.c | 2 +- 15 files changed, 2317 insertions(+), 2317 deletions(-) delete mode 100644 arch/x86/boot/compressed/sev-es.c create mode 100644 arch/x86/boot/compressed/sev.c delete mode 100644 arch/x86/include/asm/sev-es.h create mode 100644 arch/x86/include/asm/sev.h delete mode 100644 arch/x86/kernel/sev-es-shared.c delete mode 100644 arch/x86/kernel/sev-es.c create mode 100644 arch/x86/kernel/sev-shared.c create mode 100644 arch/x86/kernel/sev.c (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6e5522aebbbd..2a2975236c9e 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -48,10 +48,10 @@ KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no) KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h KBUILD_CFLAGS += $(CLANG_FLAGS) -# sev-es.c indirectly inludes inat-table.h which is generated during +# sev.c indirectly inludes inat-table.h which is generated during # compilation and stored in $(objtree). Add the directory to the includes so # that the compiler finds it even with out-of-tree builds (make O=/some/path). -CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/ +CFLAGS_sev.o += -I$(objtree)/arch/x86/lib/ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n @@ -93,7 +93,7 @@ ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o - vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o + vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c deleted file mode 100644 index 82041bd380e5..000000000000 --- a/arch/x86/boot/compressed/sev-es.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * AMD Encrypted Register State Support - * - * Author: Joerg Roedel - */ - -/* - * misc.h needs to be first because it knows how to include the other kernel - * headers in the pre-decompression code in a way that does not break - * compilation. - */ -#include "misc.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "error.h" - -struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); -struct ghcb *boot_ghcb; - -/* - * Copy a version of this function here - insn-eval.c can't be used in - * pre-decompression code. - */ -static bool insn_has_rep_prefix(struct insn *insn) -{ - insn_byte_t p; - int i; - - insn_get_prefixes(insn); - - for_each_insn_prefix(insn, i, p) { - if (p == 0xf2 || p == 0xf3) - return true; - } - - return false; -} - -/* - * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and - * doesn't use segments. - */ -static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) -{ - return 0UL; -} - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - unsigned long low, high; - - asm volatile("rdmsr" : "=a" (low), "=d" (high) : - "c" (MSR_AMD64_SEV_ES_GHCB)); - - return ((high << 32) | low); -} - -static inline void sev_es_wr_ghcb_msr(u64 val) -{ - u32 low, high; - - low = val & 0xffffffffUL; - high = val >> 32; - - asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), - "a"(low), "d" (high) : "memory"); -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int ret; - - memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - - return ES_OK; -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - void *dst, char *buf, size_t size) -{ - memcpy(dst, buf, size); - - return ES_OK; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - void *src, char *buf, size_t size) -{ - memcpy(buf, src, size); - - return ES_OK; -} - -#undef __init -#undef __pa -#define __init -#define __pa(x) ((unsigned long)(x)) - -#define __BOOT_COMPRESSED - -/* Basic instruction decoding support needed */ -#include "../../lib/inat.c" -#include "../../lib/insn.c" - -/* Include code for early handlers */ -#include "../../kernel/sev-es-shared.c" - -static bool early_setup_sev_es(void) -{ - if (!sev_es_negotiate_protocol()) - sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); - - if (set_page_decrypted((unsigned long)&boot_ghcb_page)) - return false; - - /* Page is now mapped decrypted, clear it */ - memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); - - boot_ghcb = &boot_ghcb_page; - - /* Initialize lookup tables for the instruction decoder */ - inat_init_tables(); - - return true; -} - -void sev_es_shutdown_ghcb(void) -{ - if (!boot_ghcb) - return; - - if (!sev_es_check_cpu_features()) - error("SEV-ES CPU Features missing."); - - /* - * GHCB Page must be flushed from the cache and mapped encrypted again. - * Otherwise the running kernel will see strange cache effects when - * trying to use that page. - */ - if (set_page_encrypted((unsigned long)&boot_ghcb_page)) - error("Can't map GHCB page encrypted"); - - /* - * GHCB page is mapped encrypted again and flushed from the cache. - * Mark it non-present now to catch bugs when #VC exceptions trigger - * after this point. - */ - if (set_page_non_present((unsigned long)&boot_ghcb_page)) - error("Can't unmap GHCB page"); -} - -bool sev_es_check_ghcb_fault(unsigned long address) -{ - /* Check whether the fault was on the GHCB page */ - return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); -} - -void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) -{ - struct es_em_ctxt ctxt; - enum es_result result; - - if (!boot_ghcb && !early_setup_sev_es()) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); - - vc_ghcb_invalidate(boot_ghcb); - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result != ES_OK) - goto finish; - - switch (exit_code) { - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(boot_ghcb, &ctxt); - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(boot_ghcb, &ctxt); - break; - default: - result = ES_UNSUPPORTED; - break; - } - -finish: - if (result == ES_OK) - vc_finish_insn(&ctxt); - else if (result != ES_RETRY) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); -} diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c new file mode 100644 index 000000000000..670e998fe930 --- /dev/null +++ b/arch/x86/boot/compressed/sev.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + */ + +/* + * misc.h needs to be first because it knows how to include the other kernel + * headers in the pre-decompression code in a way that does not break + * compilation. + */ +#include "misc.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "error.h" + +struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); +struct ghcb *boot_ghcb; + +/* + * Copy a version of this function here - insn-eval.c can't be used in + * pre-decompression code. + */ +static bool insn_has_rep_prefix(struct insn *insn) +{ + insn_byte_t p; + int i; + + insn_get_prefixes(insn); + + for_each_insn_prefix(insn, i, p) { + if (p == 0xf2 || p == 0xf3) + return true; + } + + return false; +} + +/* + * Only a dummy for insn_get_seg_base() - Early boot-code is 64bit only and + * doesn't use segments. + */ +static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx) +{ + return 0UL; +} + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + unsigned long low, high; + + asm volatile("rdmsr" : "=a" (low), "=d" (high) : + "c" (MSR_AMD64_SEV_ES_GHCB)); + + return ((high << 32) | low); +} + +static inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = val & 0xffffffffUL; + high = val >> 32; + + asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB), + "a"(low), "d" (high) : "memory"); +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int ret; + + memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + + return ES_OK; +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + void *dst, char *buf, size_t size) +{ + memcpy(dst, buf, size); + + return ES_OK; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + void *src, char *buf, size_t size) +{ + memcpy(buf, src, size); + + return ES_OK; +} + +#undef __init +#undef __pa +#define __init +#define __pa(x) ((unsigned long)(x)) + +#define __BOOT_COMPRESSED + +/* Basic instruction decoding support needed */ +#include "../../lib/inat.c" +#include "../../lib/insn.c" + +/* Include code for early handlers */ +#include "../../kernel/sev-shared.c" + +static bool early_setup_sev_es(void) +{ + if (!sev_es_negotiate_protocol()) + sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED); + + if (set_page_decrypted((unsigned long)&boot_ghcb_page)) + return false; + + /* Page is now mapped decrypted, clear it */ + memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page)); + + boot_ghcb = &boot_ghcb_page; + + /* Initialize lookup tables for the instruction decoder */ + inat_init_tables(); + + return true; +} + +void sev_es_shutdown_ghcb(void) +{ + if (!boot_ghcb) + return; + + if (!sev_es_check_cpu_features()) + error("SEV-ES CPU Features missing."); + + /* + * GHCB Page must be flushed from the cache and mapped encrypted again. + * Otherwise the running kernel will see strange cache effects when + * trying to use that page. + */ + if (set_page_encrypted((unsigned long)&boot_ghcb_page)) + error("Can't map GHCB page encrypted"); + + /* + * GHCB page is mapped encrypted again and flushed from the cache. + * Mark it non-present now to catch bugs when #VC exceptions trigger + * after this point. + */ + if (set_page_non_present((unsigned long)&boot_ghcb_page)) + error("Can't unmap GHCB page"); +} + +bool sev_es_check_ghcb_fault(unsigned long address) +{ + /* Check whether the fault was on the GHCB page */ + return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page); +} + +void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) +{ + struct es_em_ctxt ctxt; + enum es_result result; + + if (!boot_ghcb && !early_setup_sev_es()) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result != ES_OK) + goto finish; + + switch (exit_code) { + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(boot_ghcb, &ctxt); + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(boot_ghcb, &ctxt); + break; + default: + result = ES_UNSUPPORTED; + break; + } + +finish: + if (result == ES_OK) + vc_finish_insn(&ctxt); + else if (result != ES_RETRY) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); +} diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h deleted file mode 100644 index cf1d957c7091..000000000000 --- a/arch/x86/include/asm/sev-es.h +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * AMD Encrypted Register State Support - * - * Author: Joerg Roedel - */ - -#ifndef __ASM_ENCRYPTED_STATE_H -#define __ASM_ENCRYPTED_STATE_H - -#include -#include - -#define GHCB_SEV_INFO 0x001UL -#define GHCB_SEV_INFO_REQ 0x002UL -#define GHCB_INFO(v) ((v) & 0xfffUL) -#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) -#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) -#define GHCB_PROTO_OUR 0x0001UL -#define GHCB_SEV_CPUID_REQ 0x004UL -#define GHCB_CPUID_REQ_EAX 0 -#define GHCB_CPUID_REQ_EBX 1 -#define GHCB_CPUID_REQ_ECX 2 -#define GHCB_CPUID_REQ_EDX 3 -#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ - (((unsigned long)reg & 3) << 30) | \ - (((unsigned long)fn) << 32)) - -#define GHCB_PROTOCOL_MAX 0x0001UL -#define GHCB_DEFAULT_USAGE 0x0000UL - -#define GHCB_SEV_CPUID_RESP 0x005UL -#define GHCB_SEV_TERMINATE 0x100UL -#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ - (((((u64)reason_set) & 0x7) << 12) | \ - ((((u64)reason_val) & 0xff) << 16)) -#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 -#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 - -#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) -#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } - -enum es_result { - ES_OK, /* All good */ - ES_UNSUPPORTED, /* Requested operation not supported */ - ES_VMM_ERROR, /* Unexpected state from the VMM */ - ES_DECODE_FAILED, /* Instruction decoding failed */ - ES_EXCEPTION, /* Instruction caused exception */ - ES_RETRY, /* Retry instruction emulation */ -}; - -struct es_fault_info { - unsigned long vector; - unsigned long error_code; - unsigned long cr2; -}; - -struct pt_regs; - -/* ES instruction emulation context */ -struct es_em_ctxt { - struct pt_regs *regs; - struct insn insn; - struct es_fault_info fi; -}; - -void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); - -static inline u64 lower_bits(u64 val, unsigned int bits) -{ - u64 mask = (1ULL << bits) - 1; - - return (val & mask); -} - -struct real_mode_header; -enum stack_type; - -/* Early IDT entry points for #VC handler */ -extern void vc_no_ghcb(void); -extern void vc_boot_ghcb(void); -extern bool handle_vc_boot_ghcb(struct pt_regs *regs); - -#ifdef CONFIG_AMD_MEM_ENCRYPT -extern struct static_key_false sev_es_enable_key; -extern void __sev_es_ist_enter(struct pt_regs *regs); -extern void __sev_es_ist_exit(void); -static __always_inline void sev_es_ist_enter(struct pt_regs *regs) -{ - if (static_branch_unlikely(&sev_es_enable_key)) - __sev_es_ist_enter(regs); -} -static __always_inline void sev_es_ist_exit(void) -{ - if (static_branch_unlikely(&sev_es_enable_key)) - __sev_es_ist_exit(); -} -extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); -extern void __sev_es_nmi_complete(void); -static __always_inline void sev_es_nmi_complete(void) -{ - if (static_branch_unlikely(&sev_es_enable_key)) - __sev_es_nmi_complete(); -} -extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); -#else -static inline void sev_es_ist_enter(struct pt_regs *regs) { } -static inline void sev_es_ist_exit(void) { } -static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } -static inline void sev_es_nmi_complete(void) { } -static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } -#endif - -#endif diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h new file mode 100644 index 000000000000..cf1d957c7091 --- /dev/null +++ b/arch/x86/include/asm/sev.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + */ + +#ifndef __ASM_ENCRYPTED_STATE_H +#define __ASM_ENCRYPTED_STATE_H + +#include +#include + +#define GHCB_SEV_INFO 0x001UL +#define GHCB_SEV_INFO_REQ 0x002UL +#define GHCB_INFO(v) ((v) & 0xfffUL) +#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) +#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) +#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_SEV_CPUID_REQ 0x004UL +#define GHCB_CPUID_REQ_EAX 0 +#define GHCB_CPUID_REQ_EBX 1 +#define GHCB_CPUID_REQ_ECX 2 +#define GHCB_CPUID_REQ_EDX 3 +#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ + (((unsigned long)reg & 3) << 30) | \ + (((unsigned long)fn) << 32)) + +#define GHCB_PROTOCOL_MAX 0x0001UL +#define GHCB_DEFAULT_USAGE 0x0000UL + +#define GHCB_SEV_CPUID_RESP 0x005UL +#define GHCB_SEV_TERMINATE 0x100UL +#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ + (((((u64)reason_set) & 0x7) << 12) | \ + ((((u64)reason_val) & 0xff) << 16)) +#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 +#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 + +#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) +#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } + +enum es_result { + ES_OK, /* All good */ + ES_UNSUPPORTED, /* Requested operation not supported */ + ES_VMM_ERROR, /* Unexpected state from the VMM */ + ES_DECODE_FAILED, /* Instruction decoding failed */ + ES_EXCEPTION, /* Instruction caused exception */ + ES_RETRY, /* Retry instruction emulation */ +}; + +struct es_fault_info { + unsigned long vector; + unsigned long error_code; + unsigned long cr2; +}; + +struct pt_regs; + +/* ES instruction emulation context */ +struct es_em_ctxt { + struct pt_regs *regs; + struct insn insn; + struct es_fault_info fi; +}; + +void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); + +static inline u64 lower_bits(u64 val, unsigned int bits) +{ + u64 mask = (1ULL << bits) - 1; + + return (val & mask); +} + +struct real_mode_header; +enum stack_type; + +/* Early IDT entry points for #VC handler */ +extern void vc_no_ghcb(void); +extern void vc_boot_ghcb(void); +extern bool handle_vc_boot_ghcb(struct pt_regs *regs); + +#ifdef CONFIG_AMD_MEM_ENCRYPT +extern struct static_key_false sev_es_enable_key; +extern void __sev_es_ist_enter(struct pt_regs *regs); +extern void __sev_es_ist_exit(void); +static __always_inline void sev_es_ist_enter(struct pt_regs *regs) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_ist_enter(regs); +} +static __always_inline void sev_es_ist_exit(void) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_ist_exit(); +} +extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh); +extern void __sev_es_nmi_complete(void); +static __always_inline void sev_es_nmi_complete(void) +{ + if (static_branch_unlikely(&sev_es_enable_key)) + __sev_es_nmi_complete(); +} +extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd); +#else +static inline void sev_es_ist_enter(struct pt_regs *regs) { } +static inline void sev_es_ist_exit(void) { } +static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } +static inline void sev_es_nmi_complete(void) { } +static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +#endif + +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0704c2a94272..0f66682ac02a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -20,7 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg -CFLAGS_REMOVE_sev-es.o = -pg +CFLAGS_REMOVE_sev.o = -pg endif KASAN_SANITIZE_head$(BITS).o := n @@ -28,7 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n -KASAN_SANITIZE_sev-es.o := n +KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. @@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o +obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 18be44163a50..de01903c3735 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include /* * Manage page tables very early on. diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2ef961cf4cfc..4bce802d25fb 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c deleted file mode 100644 index 0aa9f13efd57..000000000000 --- a/arch/x86/kernel/sev-es-shared.c +++ /dev/null @@ -1,525 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * AMD Encrypted Register State Support - * - * Author: Joerg Roedel - * - * This file is not compiled stand-alone. It contains code shared - * between the pre-decompression boot code and the running Linux kernel - * and is included directly into both code-bases. - */ - -#ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) -#endif - -static bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } - - return true; -} - -static void __noreturn sev_es_terminate(unsigned int reason) -{ - u64 val = GHCB_SEV_TERMINATE; - - /* - * Tell the hypervisor what went wrong - only reason-set 0 is - * currently supported. - */ - val |= GHCB_SEV_TERMINATE_REASON(0, reason); - - /* Request Guest Termination from Hypvervisor */ - sev_es_wr_ghcb_msr(val); - VMGEXIT(); - - while (true) - asm volatile("hlt\n" : : : "memory"); -} - -static bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_INFO(val) != GHCB_SEV_INFO) - return false; - - if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) - return false; - - return true; -} - -static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) -{ - memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); -} - -static bool vc_decoding_needed(unsigned long exit_code) -{ - /* Exceptions don't require to decode the instruction */ - return !(exit_code >= SVM_EXIT_EXCP_BASE && - exit_code <= SVM_EXIT_LAST_EXCP); -} - -static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, - struct pt_regs *regs, - unsigned long exit_code) -{ - enum es_result ret = ES_OK; - - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->regs = regs; - - if (vc_decoding_needed(exit_code)) - ret = vc_decode_insn(ctxt); - - return ret; -} - -static void vc_finish_insn(struct es_em_ctxt *ctxt) -{ - ctxt->regs->ip += ctxt->insn.length; -} - -static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - enum es_result ret; - - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v; - - info = ghcb->save.sw_exit_info_2; - v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. */ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - ret = ES_EXCEPTION; - } else { - ret = ES_VMM_ERROR; - } - } else { - ret = ES_OK; - } - - return ret; -} - -/* - * Boot VC Handler - This is the first VC handler during boot, there is no GHCB - * page yet, so it only supports the MSR based communication with the - * hypervisor and only the CPUID exit-code. - */ -void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) -{ - unsigned int fn = lower_bits(regs->ax, 32); - unsigned long val; - - /* Only CPUID is supported via MSR protocol */ - if (exit_code != SVM_EXIT_CPUID) - goto fail; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) - goto fail; - regs->ax = val >> 32; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) - goto fail; - regs->bx = val >> 32; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) - goto fail; - regs->cx = val >> 32; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) - goto fail; - regs->dx = val >> 32; - - /* - * This is a VC handler and the #VC is only raised when SEV-ES is - * active, which means SEV must be active too. Do sanity checks on the - * CPUID results to make sure the hypervisor does not trick the kernel - * into the no-sev path. This could map sensitive data unencrypted and - * make it accessible to the hypervisor. - * - * In particular, check for: - * - Availability of CPUID leaf 0x8000001f - * - SEV CPUID bit. - * - * The hypervisor might still report the wrong C-bit position, but this - * can't be checked here. - */ - - if (fn == 0x80000000 && (regs->ax < 0x8000001f)) - /* SEV leaf check */ - goto fail; - else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) - /* SEV bit */ - goto fail; - - /* Skip over the CPUID two-byte opcode */ - regs->ip += 2; - - return; - -fail: - /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); -} - -static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, - void *src, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, b = backwards ? -1 : 1; - enum es_result ret = ES_OK; - - for (i = 0; i < count; i++) { - void *s = src + (i * data_size * b); - char *d = buf + (i * data_size); - - ret = vc_read_mem(ctxt, s, d, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, - void *dst, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, s = backwards ? -1 : 1; - enum es_result ret = ES_OK; - - for (i = 0; i < count; i++) { - void *d = dst + (i * data_size * s); - char *b = buf + (i * data_size); - - ret = vc_write_mem(ctxt, d, b, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -#define IOIO_TYPE_STR BIT(2) -#define IOIO_TYPE_IN 1 -#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) -#define IOIO_TYPE_OUT 0 -#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) - -#define IOIO_REP BIT(3) - -#define IOIO_ADDR_64 BIT(9) -#define IOIO_ADDR_32 BIT(8) -#define IOIO_ADDR_16 BIT(7) - -#define IOIO_DATA_32 BIT(6) -#define IOIO_DATA_16 BIT(5) -#define IOIO_DATA_8 BIT(4) - -#define IOIO_SEG_ES (0 << 10) -#define IOIO_SEG_DS (3 << 10) - -static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) -{ - struct insn *insn = &ctxt->insn; - *exitinfo = 0; - - switch (insn->opcode.bytes[0]) { - /* INS opcodes */ - case 0x6c: - case 0x6d: - *exitinfo |= IOIO_TYPE_INS; - *exitinfo |= IOIO_SEG_ES; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; - break; - - /* OUTS opcodes */ - case 0x6e: - case 0x6f: - *exitinfo |= IOIO_TYPE_OUTS; - *exitinfo |= IOIO_SEG_DS; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; - break; - - /* IN immediate opcodes */ - case 0xe4: - case 0xe5: - *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u8)insn->immediate.value << 16; - break; - - /* OUT immediate opcodes */ - case 0xe6: - case 0xe7: - *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u8)insn->immediate.value << 16; - break; - - /* IN register opcodes */ - case 0xec: - case 0xed: - *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; - break; - - /* OUT register opcodes */ - case 0xee: - case 0xef: - *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; - break; - - default: - return ES_DECODE_FAILED; - } - - switch (insn->opcode.bytes[0]) { - case 0x6c: - case 0x6e: - case 0xe4: - case 0xe6: - case 0xec: - case 0xee: - /* Single byte opcodes */ - *exitinfo |= IOIO_DATA_8; - break; - default: - /* Length determined by instruction parsing */ - *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 - : IOIO_DATA_32; - } - switch (insn->addr_bytes) { - case 2: - *exitinfo |= IOIO_ADDR_16; - break; - case 4: - *exitinfo |= IOIO_ADDR_32; - break; - case 8: - *exitinfo |= IOIO_ADDR_64; - break; - } - - if (insn_has_rep_prefix(insn)) - *exitinfo |= IOIO_REP; - - return ES_OK; -} - -static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u64 exit_info_1, exit_info_2; - enum es_result ret; - - ret = vc_ioio_exitinfo(ctxt, &exit_info_1); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_STR) { - - /* (REP) INS/OUTS */ - - bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); - unsigned int io_bytes, exit_bytes; - unsigned int ghcb_count, op_count; - unsigned long es_base; - u64 sw_scratch; - - /* - * For the string variants with rep prefix the amount of in/out - * operations per #VC exception is limited so that the kernel - * has a chance to take interrupts and re-schedule while the - * instruction is emulated. - */ - io_bytes = (exit_info_1 >> 4) & 0x7; - ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; - - op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; - exit_info_2 = min(op_count, ghcb_count); - exit_bytes = exit_info_2 * io_bytes; - - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - /* Read bytes of OUTS into the shared buffer */ - if (!(exit_info_1 & IOIO_TYPE_IN)) { - ret = vc_insn_string_read(ctxt, - (void *)(es_base + regs->si), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - } - - /* - * Issue an VMGEXIT to the HV to consume the bytes from the - * shared buffer or to have it write them into the shared buffer - * depending on the instruction: OUTS or INS. - */ - sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); - ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, - exit_info_1, exit_info_2); - if (ret != ES_OK) - return ret; - - /* Read bytes from shared buffer into the guest's destination. */ - if (exit_info_1 & IOIO_TYPE_IN) { - ret = vc_insn_string_write(ctxt, - (void *)(es_base + regs->di), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - - if (df) - regs->di -= exit_bytes; - else - regs->di += exit_bytes; - } else { - if (df) - regs->si -= exit_bytes; - else - regs->si += exit_bytes; - } - - if (exit_info_1 & IOIO_REP) - regs->cx -= exit_info_2; - - ret = regs->cx ? ES_RETRY : ES_OK; - - } else { - - /* IN/OUT into/from rAX */ - - int bits = (exit_info_1 & 0x70) >> 1; - u64 rax = 0; - - if (!(exit_info_1 & IOIO_TYPE_IN)) - rax = lower_bits(regs->ax, bits); - - ghcb_set_rax(ghcb, rax); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_IN) { - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - regs->ax = lower_bits(ghcb->save.rax, bits); - } - } - - return ret; -} - -static enum es_result vc_handle_cpuid(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u32 cr4 = native_read_cr4(); - enum es_result ret; - - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rcx(ghcb, regs->cx); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #GP - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - regs->ax = ghcb->save.rax; - regs->bx = ghcb->save.rbx; - regs->cx = ghcb->save.rcx; - regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); - enum es_result ret; - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && - (!rdtscp || ghcb_rcx_is_valid(ghcb)))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - if (rdtscp) - ctxt->regs->cx = ghcb->save.rcx; - - return ES_OK; -} diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c deleted file mode 100644 index 73873b007838..000000000000 --- a/arch/x86/kernel/sev-es.c +++ /dev/null @@ -1,1461 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * AMD Memory Encryption Support - * - * Copyright (C) 2019 SUSE - * - * Author: Joerg Roedel - */ - -#define pr_fmt(fmt) "SEV-ES: " fmt - -#include /* For show_regs() */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DR7_RESET_VALUE 0x400 - -/* For early boot hypervisor communication in SEV-ES enabled guests */ -static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -static struct ghcb __initdata *boot_ghcb; - -/* #VC handler runtime per-CPU data */ -struct sev_es_runtime_data { - struct ghcb ghcb_page; - - /* Physical storage for the per-CPU IST stack of the #VC handler */ - char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); - - /* - * Physical storage for the per-CPU fall-back stack of the #VC handler. - * The fall-back stack is used when it is not safe to switch back to the - * interrupted stack in the #VC entry code. - */ - char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); - - /* - * Reserve one page per CPU as backup storage for the unencrypted GHCB. - * It is needed when an NMI happens while the #VC handler uses the real - * GHCB, and the NMI handler itself is causing another #VC exception. In - * that case the GHCB content of the first handler needs to be backed up - * and restored. - */ - struct ghcb backup_ghcb; - - /* - * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. - * There is no need for it to be atomic, because nothing is written to - * the GHCB between the read and the write of ghcb_active. So it is safe - * to use it when a nested #VC exception happens before the write. - * - * This is necessary for example in the #VC->NMI->#VC case when the NMI - * happens while the first #VC handler uses the GHCB. When the NMI code - * raises a second #VC handler it might overwrite the contents of the - * GHCB written by the first handler. To avoid this the content of the - * GHCB is saved and restored when the GHCB is detected to be in use - * already. - */ - bool ghcb_active; - bool backup_ghcb_active; - - /* - * Cached DR7 value - write it on DR7 writes and return it on reads. - * That value will never make it to the real hardware DR7 as debugging - * is currently unsupported in SEV-ES guests. - */ - unsigned long dr7; -}; - -struct ghcb_state { - struct ghcb *ghcb; -}; - -static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); -DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); - -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - -static void __init setup_vc_stacks(int cpu) -{ - struct sev_es_runtime_data *data; - struct cpu_entry_area *cea; - unsigned long vaddr; - phys_addr_t pa; - - data = per_cpu(runtime_data, cpu); - cea = get_cpu_entry_area(cpu); - - /* Map #VC IST stack */ - vaddr = CEA_ESTACK_BOT(&cea->estacks, VC); - pa = __pa(data->ist_stack); - cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); - - /* Map VC fall-back stack */ - vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2); - pa = __pa(data->fallback_stack); - cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); -} - -static __always_inline bool on_vc_stack(struct pt_regs *regs) -{ - unsigned long sp = regs->sp; - - /* User-mode RSP is not trusted */ - if (user_mode(regs)) - return false; - - /* SYSCALL gap still has user-mode RSP */ - if (ip_within_syscall_gap(regs)) - return false; - - return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); -} - -/* - * This function handles the case when an NMI is raised in the #VC - * exception handler entry code, before the #VC handler has switched off - * its IST stack. In this case, the IST entry for #VC must be adjusted, - * so that any nested #VC exception will not overwrite the stack - * contents of the interrupted #VC handler. - * - * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a - * nested sev_es_ist_exit() call may adjust back the IST entry too - * early. - * - * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run - * on the NMI IST stack, as they are only called from NMI handling code - * right now. - */ -void noinstr __sev_es_ist_enter(struct pt_regs *regs) -{ - unsigned long old_ist, new_ist; - - /* Read old IST entry */ - new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - /* - * If NMI happened while on the #VC IST stack, set the new IST - * value below regs->sp, so that the interrupted stack frame is - * not overwritten by subsequent #VC exceptions. - */ - if (on_vc_stack(regs)) - new_ist = regs->sp; - - /* - * Reserve additional 8 bytes and store old IST value so this - * adjustment can be unrolled in __sev_es_ist_exit(). - */ - new_ist -= sizeof(old_ist); - *(unsigned long *)new_ist = old_ist; - - /* Set new IST entry */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); -} - -void noinstr __sev_es_ist_exit(void) -{ - unsigned long ist; - - /* Read IST entry */ - ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) - return; - - /* Read back old IST entry and write it to the TSS */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); -} - -static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) - return NULL; - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - -static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - data->ghcb_active = false; - } -} - -/* Needed in vc_early_forward_exception */ -void do_early_exception(struct pt_regs *regs, int trapnr); - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - return __rdmsr(MSR_AMD64_SEV_ES_GHCB); -} - -static __always_inline void sev_es_wr_ghcb_msr(u64 val) -{ - u32 low, high; - - low = (u32)(val); - high = (u32)(val >> 32); - - native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); -} - -static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, - unsigned char *buffer) -{ - return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); -} - -static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int res; - - res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (!res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } - - if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) - return ES_DECODE_FAILED; - - if (ctxt->insn.immediate.got) - return ES_OK; - else - return ES_DECODE_FAILED; -} - -static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int res, ret; - - res = vc_fetch_insn_kernel(ctxt, buffer); - if (res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - else - return ES_OK; -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - if (user_mode(ctxt->regs)) - return __vc_decode_user_insn(ctxt); - else - return __vc_decode_kern_insn(ctxt); -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - char *dst, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; - char __user *target = (char __user *)dst; - u64 d8; - u32 d4; - u16 d2; - u8 d1; - - /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ - if (!user_mode(ctxt->regs) && !access_ok(target, size)) { - memcpy(dst, buf, size); - return ES_OK; - } - - switch (size) { - case 1: - memcpy(&d1, buf, 1); - if (put_user(d1, target)) - goto fault; - break; - case 2: - memcpy(&d2, buf, 2); - if (put_user(d2, target)) - goto fault; - break; - case 4: - memcpy(&d4, buf, 4); - if (put_user(d4, target)) - goto fault; - break; - case 8: - memcpy(&d8, buf, 8); - if (put_user(d8, target)) - goto fault; - break; - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)dst; - - return ES_EXCEPTION; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - char *src, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT; - char __user *s = (char __user *)src; - u64 d8; - u32 d4; - u16 d2; - u8 d1; - - /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ - if (!user_mode(ctxt->regs) && !access_ok(s, size)) { - memcpy(buf, src, size); - return ES_OK; - } - - switch (size) { - case 1: - if (get_user(d1, s)) - goto fault; - memcpy(buf, &d1, 1); - break; - case 2: - if (get_user(d2, s)) - goto fault; - memcpy(buf, &d2, 2); - break; - case 4: - if (get_user(d4, s)) - goto fault; - memcpy(buf, &d4, 4); - break; - case 8: - if (get_user(d8, s)) - goto fault; - memcpy(buf, &d8, 8); - break; - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)src; - - return ES_EXCEPTION; -} - -static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned long vaddr, phys_addr_t *paddr) -{ - unsigned long va = (unsigned long)vaddr; - unsigned int level; - phys_addr_t pa; - pgd_t *pgd; - pte_t *pte; - - pgd = __va(read_cr3_pa()); - pgd = &pgd[pgd_index(va)]; - pte = lookup_address_in_pgd(pgd, va, &level); - if (!pte) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.cr2 = vaddr; - ctxt->fi.error_code = 0; - - if (user_mode(ctxt->regs)) - ctxt->fi.error_code |= X86_PF_USER; - - return ES_EXCEPTION; - } - - if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) - /* Emulated MMIO to/from encrypted memory not supported */ - return ES_UNSUPPORTED; - - pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - pa |= va & ~page_level_mask(level); - - *paddr = pa; - - return ES_OK; -} - -/* Include code shared with pre-decompression boot stage */ -#include "sev-es-shared.c" - -void noinstr __sev_es_nmi_complete(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; - - ghcb = sev_es_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); - VMGEXIT(); - - sev_es_put_ghcb(&state); -} - -static u64 get_jump_table_addr(void) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - u64 ret = 0; - - local_irq_save(flags); - - ghcb = sev_es_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (ghcb_sw_exit_info_1_is_valid(ghcb) && - ghcb_sw_exit_info_2_is_valid(ghcb)) - ret = ghcb->save.sw_exit_info_2; - - sev_es_put_ghcb(&state); - - local_irq_restore(flags); - - return ret; -} - -int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) -{ - u16 startup_cs, startup_ip; - phys_addr_t jump_table_pa; - u64 jump_table_addr; - u16 __iomem *jump_table; - - jump_table_addr = get_jump_table_addr(); - - /* On UP guests there is no jump table so this is not a failure */ - if (!jump_table_addr) - return 0; - - /* Check if AP Jump Table is page-aligned */ - if (jump_table_addr & ~PAGE_MASK) - return -EINVAL; - - jump_table_pa = jump_table_addr & PAGE_MASK; - - startup_cs = (u16)(rmh->trampoline_start >> 4); - startup_ip = (u16)(rmh->sev_es_trampoline_start - - rmh->trampoline_start); - - jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); - if (!jump_table) - return -EIO; - - writew(startup_ip, &jump_table[0]); - writew(startup_cs, &jump_table[1]); - - iounmap(jump_table); - - return 0; -} - -/* - * This is needed by the OVMF UEFI firmware which will use whatever it finds in - * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu - * runtime GHCBs used by the kernel are also mapped in the EFI page-table. - */ -int __init sev_es_efi_map_ghcbs(pgd_t *pgd) -{ - struct sev_es_runtime_data *data; - unsigned long address, pflags; - int cpu; - u64 pfn; - - if (!sev_es_active()) - return 0; - - pflags = _PAGE_NX | _PAGE_RW; - - for_each_possible_cpu(cpu) { - data = per_cpu(runtime_data, cpu); - - address = __pa(&data->ghcb_page); - pfn = address >> PAGE_SHIFT; - - if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) - return 1; - } - - return 0; -} - -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - enum es_result ret; - u64 exit_info_1; - - /* Is it a WRMSR? */ - exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; - - ghcb_set_rcx(ghcb, regs->cx); - if (exit_info_1) { - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rdx(ghcb, regs->dx); - } - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); - - if ((ret == ES_OK) && (!exit_info_1)) { - regs->ax = ghcb->save.rax; - regs->dx = ghcb->save.rdx; - } - - return ret; -} - -/* - * This function runs on the first #VC exception after the kernel - * switched to virtual addresses. - */ -static bool __init sev_es_setup_ghcb(void) -{ - /* First make sure the hypervisor talks a supported protocol. */ - if (!sev_es_negotiate_protocol()) - return false; - - /* - * Clear the boot_ghcb. The first exception comes in before the bss - * section is cleared. - */ - memset(&boot_ghcb_page, 0, PAGE_SIZE); - - /* Alright - Make the boot-ghcb public */ - boot_ghcb = &boot_ghcb_page; - - return true; -} - -#ifdef CONFIG_HOTPLUG_CPU -static void sev_es_ap_hlt_loop(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; - - ghcb = sev_es_get_ghcb(&state); - - while (true) { - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - /* Wakeup signal? */ - if (ghcb_sw_exit_info_2_is_valid(ghcb) && - ghcb->save.sw_exit_info_2) - break; - } - - sev_es_put_ghcb(&state); -} - -/* - * Play_dead handler when running under SEV-ES. This is needed because - * the hypervisor can't deliver an SIPI request to restart the AP. - * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the - * hypervisor wakes it up again. - */ -static void sev_es_play_dead(void) -{ - play_dead_common(); - - /* IRQs now disabled */ - - sev_es_ap_hlt_loop(); - - /* - * If we get here, the VCPU was woken up again. Jump to CPU - * startup code to get it back online. - */ - start_cpu0(); -} -#else /* CONFIG_HOTPLUG_CPU */ -#define sev_es_play_dead native_play_dead -#endif /* CONFIG_HOTPLUG_CPU */ - -#ifdef CONFIG_SMP -static void __init sev_es_setup_play_dead(void) -{ - smp_ops.play_dead = sev_es_play_dead; -} -#else -static inline void sev_es_setup_play_dead(void) { } -#endif - -static void __init alloc_runtime_data(int cpu) -{ - struct sev_es_runtime_data *data; - - data = memblock_alloc(sizeof(*data), PAGE_SIZE); - if (!data) - panic("Can't allocate SEV-ES runtime data"); - - per_cpu(runtime_data, cpu) = data; -} - -static void __init init_ghcb(int cpu) -{ - struct sev_es_runtime_data *data; - int err; - - data = per_cpu(runtime_data, cpu); - - err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, - sizeof(data->ghcb_page)); - if (err) - panic("Can't map GHCBs unencrypted"); - - memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); - - data->ghcb_active = false; - data->backup_ghcb_active = false; -} - -void __init sev_es_init_vc_handling(void) -{ - int cpu; - - BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); - - if (!sev_es_active()) - return; - - if (!sev_es_check_cpu_features()) - panic("SEV-ES CPU Features missing"); - - /* Enable SEV-ES special handling */ - static_branch_enable(&sev_es_enable_key); - - /* Initialize per-cpu GHCB pages */ - for_each_possible_cpu(cpu) { - alloc_runtime_data(cpu); - init_ghcb(cpu); - setup_vc_stacks(cpu); - } - - sev_es_setup_play_dead(); - - /* Secondary CPUs use the runtime #VC handler */ - initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; -} - -static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) -{ - int trapnr = ctxt->fi.vector; - - if (trapnr == X86_TRAP_PF) - native_write_cr2(ctxt->fi.cr2); - - ctxt->regs->orig_ax = ctxt->fi.error_code; - do_early_exception(ctxt->regs, trapnr); -} - -static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} - -static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} -static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned int bytes, bool read) -{ - u64 exit_code, exit_info_1, exit_info_2; - unsigned long ghcb_pa = __pa(ghcb); - enum es_result res; - phys_addr_t paddr; - void __user *ref; - - ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); - if (ref == (void __user *)-1L) - return ES_UNSUPPORTED; - - exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; - - res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); - if (res != ES_OK) { - if (res == ES_EXCEPTION && !read) - ctxt->fi.error_code |= X86_PF_WRITE; - - return res; - } - - exit_info_1 = paddr; - /* Can never be greater than 8 */ - exit_info_2 = bytes; - - ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - - return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); -} - -static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - unsigned int bytes = 0; - enum es_result ret; - int sign_byte; - long *reg_data; - - switch (insn->opcode.bytes[1]) { - /* MMIO Read w/ zero-extension */ - case 0xb6: - bytes = 1; - fallthrough; - case 0xb7: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - memset(reg_data, 0, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - /* MMIO Read w/ sign-extension */ - case 0xbe: - bytes = 1; - fallthrough; - case 0xbf: - if (!bytes) - bytes = 2; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Sign extend based on operand size */ - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 0xff : 0x00; - } - memset(reg_data, sign_byte, insn->opnd_bytes); - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - default: - ret = ES_UNSUPPORTED; - } - - return ret; -} - -/* - * The MOVS instruction has two memory operands, which raises the - * problem that it is not known whether the access to the source or the - * destination caused the #VC exception (and hence whether an MMIO read - * or write operation needs to be emulated). - * - * Instead of playing games with walking page-tables and trying to guess - * whether the source or destination is an MMIO range, split the move - * into two operations, a read and a write with only one memory operand. - * This will cause a nested #VC exception on the MMIO address which can - * then be handled. - * - * This implementation has the benefit that it also supports MOVS where - * source _and_ destination are MMIO regions. - * - * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a - * rare operation. If it turns out to be a performance problem the split - * operations can be moved to memcpy_fromio() and memcpy_toio(). - */ -static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, - unsigned int bytes) -{ - unsigned long ds_base, es_base; - unsigned char *src, *dst; - unsigned char buffer[8]; - enum es_result ret; - bool rep; - int off; - - ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - if (ds_base == -1L || es_base == -1L) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - src = ds_base + (unsigned char *)ctxt->regs->si; - dst = es_base + (unsigned char *)ctxt->regs->di; - - ret = vc_read_mem(ctxt, src, buffer, bytes); - if (ret != ES_OK) - return ret; - - ret = vc_write_mem(ctxt, dst, buffer, bytes); - if (ret != ES_OK) - return ret; - - if (ctxt->regs->flags & X86_EFLAGS_DF) - off = -bytes; - else - off = bytes; - - ctxt->regs->si += off; - ctxt->regs->di += off; - - rep = insn_has_rep_prefix(&ctxt->insn); - if (rep) - ctxt->regs->cx -= 1; - - if (!rep || ctxt->regs->cx == 0) - return ES_OK; - else - return ES_RETRY; -} - -static enum es_result vc_handle_mmio(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - unsigned int bytes = 0; - enum es_result ret; - long *reg_data; - - switch (insn->opcode.bytes[0]) { - /* MMIO Write */ - case 0x88: - bytes = 1; - fallthrough; - case 0x89: - if (!bytes) - bytes = insn->opnd_bytes; - - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - memcpy(ghcb->shared_buffer, reg_data, bytes); - - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - - case 0xc6: - bytes = 1; - fallthrough; - case 0xc7: - if (!bytes) - bytes = insn->opnd_bytes; - - memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - - /* MMIO Read */ - case 0x8a: - bytes = 1; - fallthrough; - case 0x8b: - if (!bytes) - bytes = insn->opnd_bytes; - - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - reg_data = vc_insn_get_reg(ctxt); - if (!reg_data) - return ES_DECODE_FAILED; - - /* Zero-extend for 32-bit operation */ - if (bytes == 4) - *reg_data = 0; - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - - /* MOVS instruction */ - case 0xa4: - bytes = 1; - fallthrough; - case 0xa5: - if (!bytes) - bytes = insn->opnd_bytes; - - ret = vc_handle_mmio_movs(ctxt, bytes); - break; - /* Two-Byte Opcodes */ - case 0x0f: - ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); - break; - default: - ret = ES_UNSUPPORTED; - } - - return ret; -} - -static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long val, *reg = vc_insn_get_rm(ctxt); - enum es_result ret; - - if (!reg) - return ES_DECODE_FAILED; - - val = *reg; - - /* Upper 32 bits must be written as zeroes */ - if (val >> 32) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - /* Clear out other reserved bits and set bit 10 */ - val = (val & 0xffff23ffL) | BIT(10); - - /* Early non-zero writes to DR7 are not supported */ - if (!data && (val & ~DR7_RESET_VALUE)) - return ES_UNSUPPORTED; - - /* Using a value of 0 for ExitInfo1 means RAX holds the value */ - ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); - if (ret != ES_OK) - return ret; - - if (data) - data->dr7 = val; - - return ES_OK; -} - -static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long *reg = vc_insn_get_rm(ctxt); - - if (!reg) - return ES_DECODE_FAILED; - - if (data) - *reg = data->dr7; - else - *reg = DR7_RESET_VALUE; - - return ES_OK; -} - -static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); -} - -static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rcx(ghcb, ctxt->regs->cx); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_monitor(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Treat it as a NOP and do not leak a physical address to the - * hypervisor. - */ - return ES_OK; -} - -static enum es_result vc_handle_mwait(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* Treat the same as MONITOR/MONITORX */ - return ES_OK; -} - -static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rax(ghcb, ctxt->regs->ax); - ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); - - if (x86_platform.hyper.sev_es_hcall_prepare) - x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); - if (ret != ES_OK) - return ret; - - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - - /* - * Call sev_es_hcall_finish() after regs->ax is already set. - * This allows the hypervisor handler to overwrite it again if - * necessary. - */ - if (x86_platform.hyper.sev_es_hcall_finish && - !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) - return ES_VMM_ERROR; - - return ES_OK; -} - -static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Calling ecx_alignment_check() directly does not work, because it - * enables IRQs and the GHCB is active. Forward the exception and call - * it later from vc_forward_exception(). - */ - ctxt->fi.vector = X86_TRAP_AC; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; -} - -static __always_inline void vc_handle_trap_db(struct pt_regs *regs) -{ - if (user_mode(regs)) - noist_exc_debug(regs); - else - exc_debug(regs); -} - -static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, - struct ghcb *ghcb, - unsigned long exit_code) -{ - enum es_result result; - - switch (exit_code) { - case SVM_EXIT_READ_DR7: - result = vc_handle_dr7_read(ghcb, ctxt); - break; - case SVM_EXIT_WRITE_DR7: - result = vc_handle_dr7_write(ghcb, ctxt); - break; - case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: - result = vc_handle_trap_ac(ghcb, ctxt); - break; - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(ghcb, ctxt, exit_code); - break; - case SVM_EXIT_RDPMC: - result = vc_handle_rdpmc(ghcb, ctxt); - break; - case SVM_EXIT_INVD: - pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); - result = ES_UNSUPPORTED; - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(ghcb, ctxt); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(ghcb, ctxt); - break; - case SVM_EXIT_MSR: - result = vc_handle_msr(ghcb, ctxt); - break; - case SVM_EXIT_VMMCALL: - result = vc_handle_vmmcall(ghcb, ctxt); - break; - case SVM_EXIT_WBINVD: - result = vc_handle_wbinvd(ghcb, ctxt); - break; - case SVM_EXIT_MONITOR: - result = vc_handle_monitor(ghcb, ctxt); - break; - case SVM_EXIT_MWAIT: - result = vc_handle_mwait(ghcb, ctxt); - break; - case SVM_EXIT_NPF: - result = vc_handle_mmio(ghcb, ctxt); - break; - default: - /* - * Unexpected #VC exception - */ - result = ES_UNSUPPORTED; - } - - return result; -} - -static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) -{ - long error_code = ctxt->fi.error_code; - int trapnr = ctxt->fi.vector; - - ctxt->regs->orig_ax = ctxt->fi.error_code; - - switch (trapnr) { - case X86_TRAP_GP: - exc_general_protection(ctxt->regs, error_code); - break; - case X86_TRAP_UD: - exc_invalid_op(ctxt->regs); - break; - case X86_TRAP_AC: - exc_alignment_check(ctxt->regs, error_code); - break; - default: - pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); - BUG(); - } -} - -static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) -{ - unsigned long sp = (unsigned long)regs; - - return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); -} - -/* - * Main #VC exception handler. It is called when the entry code was able to - * switch off the IST to a safe kernel stack. - * - * With the current implementation it is always possible to switch to a safe - * stack because #VC exceptions only happen at known places, like intercepted - * instructions or accesses to MMIO areas/IO ports. They can also happen with - * code instrumentation when the hypervisor intercepts #DB, but the critical - * paths are forbidden to be instrumented, so #DB exceptions currently also - * only happen in safe places. - */ -DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - irqentry_state_t irq_state; - struct ghcb_state state; - struct es_em_ctxt ctxt; - enum es_result result; - struct ghcb *ghcb; - - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { - vc_handle_trap_db(regs); - return; - } - - irq_state = irqentry_nmi_enter(regs); - lockdep_assert_irqs_disabled(); - instrumentation_begin(); - - /* - * This is invoked through an interrupt gate, so IRQs are disabled. The - * code below might walk page-tables for user or kernel addresses, so - * keep the IRQs disabled to protect us against concurrent TLB flushes. - */ - - ghcb = sev_es_get_ghcb(&state); - if (!ghcb) { - /* - * Mark GHCBs inactive so that panic() is able to print the - * message. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - } - - vc_ghcb_invalidate(ghcb); - result = vc_init_em_ctxt(&ctxt, regs, error_code); - - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, ghcb, error_code); - - sev_es_put_ghcb(&state); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", - error_code, regs->ip); - goto fail; - case ES_VMM_ERROR: - pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - goto fail; - case ES_DECODE_FAILED: - pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - goto fail; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - pr_emerg("Unknown result in %s():%d\n", __func__, result); - /* - * Emulating the instruction which caused the #VC exception - * failed - can't continue so print debug information - */ - BUG(); - } - -out: - instrumentation_end(); - irqentry_nmi_exit(regs, irq_state); - - return; - -fail: - if (user_mode(regs)) { - /* - * Do not kill the machine if user-space triggered the - * exception. Send SIGBUS instead and let user-space deal with - * it. - */ - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); - } else { - pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", - result); - - /* Show some debug info */ - show_regs(regs); - - /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); - - /* If that fails and we get here - just panic */ - panic("Returned from Terminate-Request to Hypervisor\n"); - } - - goto out; -} - -/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */ -DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) -{ - instrumentation_begin(); - panic("Can't handle #VC exception from unsupported context\n"); - instrumentation_end(); -} - -DEFINE_IDTENTRY_VC(exc_vmm_communication) -{ - if (likely(!on_vc_fallback_stack(regs))) - safe_stack_exc_vmm_communication(regs, error_code); - else - ist_exc_vmm_communication(regs, error_code); -} - -bool __init handle_vc_boot_ghcb(struct pt_regs *regs) -{ - unsigned long exit_code = regs->orig_ax; - struct es_em_ctxt ctxt; - enum es_result result; - - /* Do initial setup or terminate the guest */ - if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); - - vc_ghcb_invalidate(boot_ghcb); - - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_VMM_ERROR: - early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_DECODE_FAILED: - early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_EXCEPTION: - vc_early_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - BUG(); - } - - return true; - -fail: - show_regs(regs); - - while (true) - halt(); -} diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c new file mode 100644 index 000000000000..0aa9f13efd57 --- /dev/null +++ b/arch/x86/kernel/sev-shared.c @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Encrypted Register State Support + * + * Author: Joerg Roedel + * + * This file is not compiled stand-alone. It contains code shared + * between the pre-decompression boot code and the running Linux kernel + * and is included directly into both code-bases. + */ + +#ifndef __BOOT_COMPRESSED +#define error(v) pr_err(v) +#define has_cpuflag(f) boot_cpu_has(f) +#endif + +static bool __init sev_es_check_cpu_features(void) +{ + if (!has_cpuflag(X86_FEATURE_RDRAND)) { + error("RDRAND instruction not supported - no trusted source of randomness available\n"); + return false; + } + + return true; +} + +static void __noreturn sev_es_terminate(unsigned int reason) +{ + u64 val = GHCB_SEV_TERMINATE; + + /* + * Tell the hypervisor what went wrong - only reason-set 0 is + * currently supported. + */ + val |= GHCB_SEV_TERMINATE_REASON(0, reason); + + /* Request Guest Termination from Hypvervisor */ + sev_es_wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +static bool sev_es_negotiate_protocol(void) +{ + u64 val; + + /* Do the GHCB protocol version negotiation */ + sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + + if (GHCB_INFO(val) != GHCB_SEV_INFO) + return false; + + if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || + GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) + return false; + + return true; +} + +static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) +{ + memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); +} + +static bool vc_decoding_needed(unsigned long exit_code) +{ + /* Exceptions don't require to decode the instruction */ + return !(exit_code >= SVM_EXIT_EXCP_BASE && + exit_code <= SVM_EXIT_LAST_EXCP); +} + +static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, + struct pt_regs *regs, + unsigned long exit_code) +{ + enum es_result ret = ES_OK; + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->regs = regs; + + if (vc_decoding_needed(exit_code)) + ret = vc_decode_insn(ctxt); + + return ret; +} + +static void vc_finish_insn(struct es_em_ctxt *ctxt) +{ + ctxt->regs->ip += ctxt->insn.length; +} + +static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + u64 exit_code, u64 exit_info_1, + u64 exit_info_2) +{ + enum es_result ret; + + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) { + u64 info = ghcb->save.sw_exit_info_2; + unsigned long v; + + info = ghcb->save.sw_exit_info_2; + v = info & SVM_EVTINJ_VEC_MASK; + + /* Check if exception information from hypervisor is sane. */ + if ((info & SVM_EVTINJ_VALID) && + ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && + ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { + ctxt->fi.vector = v; + if (info & SVM_EVTINJ_VALID_ERR) + ctxt->fi.error_code = info >> 32; + ret = ES_EXCEPTION; + } else { + ret = ES_VMM_ERROR; + } + } else { + ret = ES_OK; + } + + return ret; +} + +/* + * Boot VC Handler - This is the first VC handler during boot, there is no GHCB + * page yet, so it only supports the MSR based communication with the + * hypervisor and only the CPUID exit-code. + */ +void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) +{ + unsigned int fn = lower_bits(regs->ax, 32); + unsigned long val; + + /* Only CPUID is supported via MSR protocol */ + if (exit_code != SVM_EXIT_CPUID) + goto fail; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->ax = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->bx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->cx = val >> 32; + + sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); + VMGEXIT(); + val = sev_es_rd_ghcb_msr(); + if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + goto fail; + regs->dx = val >> 32; + + /* + * This is a VC handler and the #VC is only raised when SEV-ES is + * active, which means SEV must be active too. Do sanity checks on the + * CPUID results to make sure the hypervisor does not trick the kernel + * into the no-sev path. This could map sensitive data unencrypted and + * make it accessible to the hypervisor. + * + * In particular, check for: + * - Availability of CPUID leaf 0x8000001f + * - SEV CPUID bit. + * + * The hypervisor might still report the wrong C-bit position, but this + * can't be checked here. + */ + + if (fn == 0x80000000 && (regs->ax < 0x8000001f)) + /* SEV leaf check */ + goto fail; + else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) + /* SEV bit */ + goto fail; + + /* Skip over the CPUID two-byte opcode */ + regs->ip += 2; + + return; + +fail: + /* Terminate the guest */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); +} + +static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, + void *src, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, b = backwards ? -1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *s = src + (i * data_size * b); + char *d = buf + (i * data_size); + + ret = vc_read_mem(ctxt, s, d, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, + void *dst, char *buf, + unsigned int data_size, + unsigned int count, + bool backwards) +{ + int i, s = backwards ? -1 : 1; + enum es_result ret = ES_OK; + + for (i = 0; i < count; i++) { + void *d = dst + (i * data_size * s); + char *b = buf + (i * data_size); + + ret = vc_write_mem(ctxt, d, b, data_size); + if (ret != ES_OK) + break; + } + + return ret; +} + +#define IOIO_TYPE_STR BIT(2) +#define IOIO_TYPE_IN 1 +#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) +#define IOIO_TYPE_OUT 0 +#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) + +#define IOIO_REP BIT(3) + +#define IOIO_ADDR_64 BIT(9) +#define IOIO_ADDR_32 BIT(8) +#define IOIO_ADDR_16 BIT(7) + +#define IOIO_DATA_32 BIT(6) +#define IOIO_DATA_16 BIT(5) +#define IOIO_DATA_8 BIT(4) + +#define IOIO_SEG_ES (0 << 10) +#define IOIO_SEG_DS (3 << 10) + +static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) +{ + struct insn *insn = &ctxt->insn; + *exitinfo = 0; + + switch (insn->opcode.bytes[0]) { + /* INS opcodes */ + case 0x6c: + case 0x6d: + *exitinfo |= IOIO_TYPE_INS; + *exitinfo |= IOIO_SEG_ES; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUTS opcodes */ + case 0x6e: + case 0x6f: + *exitinfo |= IOIO_TYPE_OUTS; + *exitinfo |= IOIO_SEG_DS; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* IN immediate opcodes */ + case 0xe4: + case 0xe5: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (u8)insn->immediate.value << 16; + break; + + /* OUT immediate opcodes */ + case 0xe6: + case 0xe7: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (u8)insn->immediate.value << 16; + break; + + /* IN register opcodes */ + case 0xec: + case 0xed: + *exitinfo |= IOIO_TYPE_IN; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + /* OUT register opcodes */ + case 0xee: + case 0xef: + *exitinfo |= IOIO_TYPE_OUT; + *exitinfo |= (ctxt->regs->dx & 0xffff) << 16; + break; + + default: + return ES_DECODE_FAILED; + } + + switch (insn->opcode.bytes[0]) { + case 0x6c: + case 0x6e: + case 0xe4: + case 0xe6: + case 0xec: + case 0xee: + /* Single byte opcodes */ + *exitinfo |= IOIO_DATA_8; + break; + default: + /* Length determined by instruction parsing */ + *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 + : IOIO_DATA_32; + } + switch (insn->addr_bytes) { + case 2: + *exitinfo |= IOIO_ADDR_16; + break; + case 4: + *exitinfo |= IOIO_ADDR_32; + break; + case 8: + *exitinfo |= IOIO_ADDR_64; + break; + } + + if (insn_has_rep_prefix(insn)) + *exitinfo |= IOIO_REP; + + return ES_OK; +} + +static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u64 exit_info_1, exit_info_2; + enum es_result ret; + + ret = vc_ioio_exitinfo(ctxt, &exit_info_1); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_STR) { + + /* (REP) INS/OUTS */ + + bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); + unsigned int io_bytes, exit_bytes; + unsigned int ghcb_count, op_count; + unsigned long es_base; + u64 sw_scratch; + + /* + * For the string variants with rep prefix the amount of in/out + * operations per #VC exception is limited so that the kernel + * has a chance to take interrupts and re-schedule while the + * instruction is emulated. + */ + io_bytes = (exit_info_1 >> 4) & 0x7; + ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; + + op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; + exit_info_2 = min(op_count, ghcb_count); + exit_bytes = exit_info_2 * io_bytes; + + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + /* Read bytes of OUTS into the shared buffer */ + if (!(exit_info_1 & IOIO_TYPE_IN)) { + ret = vc_insn_string_read(ctxt, + (void *)(es_base + regs->si), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + } + + /* + * Issue an VMGEXIT to the HV to consume the bytes from the + * shared buffer or to have it write them into the shared buffer + * depending on the instruction: OUTS or INS. + */ + sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); + ghcb_set_sw_scratch(ghcb, sw_scratch); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, + exit_info_1, exit_info_2); + if (ret != ES_OK) + return ret; + + /* Read bytes from shared buffer into the guest's destination. */ + if (exit_info_1 & IOIO_TYPE_IN) { + ret = vc_insn_string_write(ctxt, + (void *)(es_base + regs->di), + ghcb->shared_buffer, io_bytes, + exit_info_2, df); + if (ret) + return ret; + + if (df) + regs->di -= exit_bytes; + else + regs->di += exit_bytes; + } else { + if (df) + regs->si -= exit_bytes; + else + regs->si += exit_bytes; + } + + if (exit_info_1 & IOIO_REP) + regs->cx -= exit_info_2; + + ret = regs->cx ? ES_RETRY : ES_OK; + + } else { + + /* IN/OUT into/from rAX */ + + int bits = (exit_info_1 & 0x70) >> 1; + u64 rax = 0; + + if (!(exit_info_1 & IOIO_TYPE_IN)) + rax = lower_bits(regs->ax, bits); + + ghcb_set_rax(ghcb, rax); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); + if (ret != ES_OK) + return ret; + + if (exit_info_1 & IOIO_TYPE_IN) { + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + regs->ax = lower_bits(ghcb->save.rax, bits); + } + } + + return ret; +} + +static enum es_result vc_handle_cpuid(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + u32 cr4 = native_read_cr4(); + enum es_result ret; + + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rcx(ghcb, regs->cx); + + if (cr4 & X86_CR4_OSXSAVE) + /* Safe to read xcr0 */ + ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); + else + /* xgetbv will cause #GP - use reset value for xcr0 */ + ghcb_set_xcr0(ghcb, 1); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && + ghcb_rbx_is_valid(ghcb) && + ghcb_rcx_is_valid(ghcb) && + ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + regs->ax = ghcb->save.rax; + regs->bx = ghcb->save.rbx; + regs->cx = ghcb->save.rcx; + regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, + struct es_em_ctxt *ctxt, + unsigned long exit_code) +{ + bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); + enum es_result ret; + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && + (!rdtscp || ghcb_rcx_is_valid(ghcb)))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + if (rdtscp) + ctxt->regs->cx = ghcb->save.rcx; + + return ES_OK; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c new file mode 100644 index 000000000000..9578c82832aa --- /dev/null +++ b/arch/x86/kernel/sev.c @@ -0,0 +1,1461 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Memory Encryption Support + * + * Copyright (C) 2019 SUSE + * + * Author: Joerg Roedel + */ + +#define pr_fmt(fmt) "SEV-ES: " fmt + +#include /* For show_regs() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DR7_RESET_VALUE 0x400 + +/* For early boot hypervisor communication in SEV-ES enabled guests */ +static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); + +/* + * Needs to be in the .data section because we need it NULL before bss is + * cleared + */ +static struct ghcb __initdata *boot_ghcb; + +/* #VC handler runtime per-CPU data */ +struct sev_es_runtime_data { + struct ghcb ghcb_page; + + /* Physical storage for the per-CPU IST stack of the #VC handler */ + char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + + /* + * Physical storage for the per-CPU fall-back stack of the #VC handler. + * The fall-back stack is used when it is not safe to switch back to the + * interrupted stack in the #VC entry code. + */ + char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE); + + /* + * Reserve one page per CPU as backup storage for the unencrypted GHCB. + * It is needed when an NMI happens while the #VC handler uses the real + * GHCB, and the NMI handler itself is causing another #VC exception. In + * that case the GHCB content of the first handler needs to be backed up + * and restored. + */ + struct ghcb backup_ghcb; + + /* + * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. + * There is no need for it to be atomic, because nothing is written to + * the GHCB between the read and the write of ghcb_active. So it is safe + * to use it when a nested #VC exception happens before the write. + * + * This is necessary for example in the #VC->NMI->#VC case when the NMI + * happens while the first #VC handler uses the GHCB. When the NMI code + * raises a second #VC handler it might overwrite the contents of the + * GHCB written by the first handler. To avoid this the content of the + * GHCB is saved and restored when the GHCB is detected to be in use + * already. + */ + bool ghcb_active; + bool backup_ghcb_active; + + /* + * Cached DR7 value - write it on DR7 writes and return it on reads. + * That value will never make it to the real hardware DR7 as debugging + * is currently unsupported in SEV-ES guests. + */ + unsigned long dr7; +}; + +struct ghcb_state { + struct ghcb *ghcb; +}; + +static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); +DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static void __init setup_vc_stacks(int cpu) +{ + struct sev_es_runtime_data *data; + struct cpu_entry_area *cea; + unsigned long vaddr; + phys_addr_t pa; + + data = per_cpu(runtime_data, cpu); + cea = get_cpu_entry_area(cpu); + + /* Map #VC IST stack */ + vaddr = CEA_ESTACK_BOT(&cea->estacks, VC); + pa = __pa(data->ist_stack); + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); + + /* Map VC fall-back stack */ + vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2); + pa = __pa(data->fallback_stack); + cea_set_pte((void *)vaddr, pa, PAGE_KERNEL); +} + +static __always_inline bool on_vc_stack(struct pt_regs *regs) +{ + unsigned long sp = regs->sp; + + /* User-mode RSP is not trusted */ + if (user_mode(regs)) + return false; + + /* SYSCALL gap still has user-mode RSP */ + if (ip_within_syscall_gap(regs)) + return false; + + return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); +} + +/* + * This function handles the case when an NMI is raised in the #VC + * exception handler entry code, before the #VC handler has switched off + * its IST stack. In this case, the IST entry for #VC must be adjusted, + * so that any nested #VC exception will not overwrite the stack + * contents of the interrupted #VC handler. + * + * The IST entry is adjusted unconditionally so that it can be also be + * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a + * nested sev_es_ist_exit() call may adjust back the IST entry too + * early. + * + * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run + * on the NMI IST stack, as they are only called from NMI handling code + * right now. + */ +void noinstr __sev_es_ist_enter(struct pt_regs *regs) +{ + unsigned long old_ist, new_ist; + + /* Read old IST entry */ + new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + /* + * If NMI happened while on the #VC IST stack, set the new IST + * value below regs->sp, so that the interrupted stack frame is + * not overwritten by subsequent #VC exceptions. + */ + if (on_vc_stack(regs)) + new_ist = regs->sp; + + /* + * Reserve additional 8 bytes and store old IST value so this + * adjustment can be unrolled in __sev_es_ist_exit(). + */ + new_ist -= sizeof(old_ist); + *(unsigned long *)new_ist = old_ist; + + /* Set new IST entry */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); +} + +void noinstr __sev_es_ist_exit(void) +{ + unsigned long ist; + + /* Read IST entry */ + ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); + + if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) + return; + + /* Read back old IST entry and write it to the TSS */ + this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); +} + +static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (unlikely(data->ghcb_active)) { + /* GHCB is already in use - save its contents */ + + if (unlikely(data->backup_ghcb_active)) + return NULL; + + /* Mark backup_ghcb active before writing to it */ + data->backup_ghcb_active = true; + + state->ghcb = &data->backup_ghcb; + + /* Backup GHCB content */ + *state->ghcb = *ghcb; + } else { + state->ghcb = NULL; + data->ghcb_active = true; + } + + return ghcb; +} + +static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + data->ghcb_active = false; + } +} + +/* Needed in vc_early_forward_exception */ +void do_early_exception(struct pt_regs *regs, int trapnr); + +static inline u64 sev_es_rd_ghcb_msr(void) +{ + return __rdmsr(MSR_AMD64_SEV_ES_GHCB); +} + +static __always_inline void sev_es_wr_ghcb_msr(u64 val) +{ + u32 low, high; + + low = (u32)(val); + high = (u32)(val >> 32); + + native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); +} + +static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, + unsigned char *buffer) +{ + return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); +} + +static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res; + + res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (!res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) + return ES_DECODE_FAILED; + + if (ctxt->insn.immediate.got) + return ES_OK; + else + return ES_DECODE_FAILED; +} + +static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) +{ + char buffer[MAX_INSN_SIZE]; + int res, ret; + + res = vc_fetch_insn_kernel(ctxt, buffer); + if (res) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = X86_PF_INSTR; + ctxt->fi.cr2 = ctxt->regs->ip; + return ES_EXCEPTION; + } + + ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); + if (ret < 0) + return ES_DECODE_FAILED; + else + return ES_OK; +} + +static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) +{ + if (user_mode(ctxt->regs)) + return __vc_decode_user_insn(ctxt); + else + return __vc_decode_kern_insn(ctxt); +} + +static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, + char *dst, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; + char __user *target = (char __user *)dst; + u64 d8; + u32 d4; + u16 d2; + u8 d1; + + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(target, size)) { + memcpy(dst, buf, size); + return ES_OK; + } + + switch (size) { + case 1: + memcpy(&d1, buf, 1); + if (put_user(d1, target)) + goto fault; + break; + case 2: + memcpy(&d2, buf, 2); + if (put_user(d2, target)) + goto fault; + break; + case 4: + memcpy(&d4, buf, 4); + if (put_user(d4, target)) + goto fault; + break; + case 8: + memcpy(&d8, buf, 8); + if (put_user(d8, target)) + goto fault; + break; + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)dst; + + return ES_EXCEPTION; +} + +static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, + char *src, char *buf, size_t size) +{ + unsigned long error_code = X86_PF_PROT; + char __user *s = (char __user *)src; + u64 d8; + u32 d4; + u16 d2; + u8 d1; + + /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ + if (!user_mode(ctxt->regs) && !access_ok(s, size)) { + memcpy(buf, src, size); + return ES_OK; + } + + switch (size) { + case 1: + if (get_user(d1, s)) + goto fault; + memcpy(buf, &d1, 1); + break; + case 2: + if (get_user(d2, s)) + goto fault; + memcpy(buf, &d2, 2); + break; + case 4: + if (get_user(d4, s)) + goto fault; + memcpy(buf, &d4, 4); + break; + case 8: + if (get_user(d8, s)) + goto fault; + memcpy(buf, &d8, 8); + break; + default: + WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); + return ES_UNSUPPORTED; + } + + return ES_OK; + +fault: + if (user_mode(ctxt->regs)) + error_code |= X86_PF_USER; + + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.error_code = error_code; + ctxt->fi.cr2 = (unsigned long)src; + + return ES_EXCEPTION; +} + +static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned long vaddr, phys_addr_t *paddr) +{ + unsigned long va = (unsigned long)vaddr; + unsigned int level; + phys_addr_t pa; + pgd_t *pgd; + pte_t *pte; + + pgd = __va(read_cr3_pa()); + pgd = &pgd[pgd_index(va)]; + pte = lookup_address_in_pgd(pgd, va, &level); + if (!pte) { + ctxt->fi.vector = X86_TRAP_PF; + ctxt->fi.cr2 = vaddr; + ctxt->fi.error_code = 0; + + if (user_mode(ctxt->regs)) + ctxt->fi.error_code |= X86_PF_USER; + + return ES_EXCEPTION; + } + + if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) + /* Emulated MMIO to/from encrypted memory not supported */ + return ES_UNSUPPORTED; + + pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; + pa |= va & ~page_level_mask(level); + + *paddr = pa; + + return ES_OK; +} + +/* Include code shared with pre-decompression boot stage */ +#include "sev-shared.c" + +void noinstr __sev_es_nmi_complete(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = sev_es_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); + VMGEXIT(); + + sev_es_put_ghcb(&state); +} + +static u64 get_jump_table_addr(void) +{ + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u64 ret = 0; + + local_irq_save(flags); + + ghcb = sev_es_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (ghcb_sw_exit_info_1_is_valid(ghcb) && + ghcb_sw_exit_info_2_is_valid(ghcb)) + ret = ghcb->save.sw_exit_info_2; + + sev_es_put_ghcb(&state); + + local_irq_restore(flags); + + return ret; +} + +int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) +{ + u16 startup_cs, startup_ip; + phys_addr_t jump_table_pa; + u64 jump_table_addr; + u16 __iomem *jump_table; + + jump_table_addr = get_jump_table_addr(); + + /* On UP guests there is no jump table so this is not a failure */ + if (!jump_table_addr) + return 0; + + /* Check if AP Jump Table is page-aligned */ + if (jump_table_addr & ~PAGE_MASK) + return -EINVAL; + + jump_table_pa = jump_table_addr & PAGE_MASK; + + startup_cs = (u16)(rmh->trampoline_start >> 4); + startup_ip = (u16)(rmh->sev_es_trampoline_start - + rmh->trampoline_start); + + jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); + if (!jump_table) + return -EIO; + + writew(startup_ip, &jump_table[0]); + writew(startup_cs, &jump_table[1]); + + iounmap(jump_table); + + return 0; +} + +/* + * This is needed by the OVMF UEFI firmware which will use whatever it finds in + * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu + * runtime GHCBs used by the kernel are also mapped in the EFI page-table. + */ +int __init sev_es_efi_map_ghcbs(pgd_t *pgd) +{ + struct sev_es_runtime_data *data; + unsigned long address, pflags; + int cpu; + u64 pfn; + + if (!sev_es_active()) + return 0; + + pflags = _PAGE_NX | _PAGE_RW; + + for_each_possible_cpu(cpu) { + data = per_cpu(runtime_data, cpu); + + address = __pa(&data->ghcb_page); + pfn = address >> PAGE_SHIFT; + + if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) + return 1; + } + + return 0; +} + +static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + struct pt_regs *regs = ctxt->regs; + enum es_result ret; + u64 exit_info_1; + + /* Is it a WRMSR? */ + exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; + + ghcb_set_rcx(ghcb, regs->cx); + if (exit_info_1) { + ghcb_set_rax(ghcb, regs->ax); + ghcb_set_rdx(ghcb, regs->dx); + } + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); + + if ((ret == ES_OK) && (!exit_info_1)) { + regs->ax = ghcb->save.rax; + regs->dx = ghcb->save.rdx; + } + + return ret; +} + +/* + * This function runs on the first #VC exception after the kernel + * switched to virtual addresses. + */ +static bool __init sev_es_setup_ghcb(void) +{ + /* First make sure the hypervisor talks a supported protocol. */ + if (!sev_es_negotiate_protocol()) + return false; + + /* + * Clear the boot_ghcb. The first exception comes in before the bss + * section is cleared. + */ + memset(&boot_ghcb_page, 0, PAGE_SIZE); + + /* Alright - Make the boot-ghcb public */ + boot_ghcb = &boot_ghcb_page; + + return true; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void sev_es_ap_hlt_loop(void) +{ + struct ghcb_state state; + struct ghcb *ghcb; + + ghcb = sev_es_get_ghcb(&state); + + while (true) { + vc_ghcb_invalidate(ghcb); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); + ghcb_set_sw_exit_info_1(ghcb, 0); + ghcb_set_sw_exit_info_2(ghcb, 0); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + /* Wakeup signal? */ + if (ghcb_sw_exit_info_2_is_valid(ghcb) && + ghcb->save.sw_exit_info_2) + break; + } + + sev_es_put_ghcb(&state); +} + +/* + * Play_dead handler when running under SEV-ES. This is needed because + * the hypervisor can't deliver an SIPI request to restart the AP. + * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the + * hypervisor wakes it up again. + */ +static void sev_es_play_dead(void) +{ + play_dead_common(); + + /* IRQs now disabled */ + + sev_es_ap_hlt_loop(); + + /* + * If we get here, the VCPU was woken up again. Jump to CPU + * startup code to get it back online. + */ + start_cpu0(); +} +#else /* CONFIG_HOTPLUG_CPU */ +#define sev_es_play_dead native_play_dead +#endif /* CONFIG_HOTPLUG_CPU */ + +#ifdef CONFIG_SMP +static void __init sev_es_setup_play_dead(void) +{ + smp_ops.play_dead = sev_es_play_dead; +} +#else +static inline void sev_es_setup_play_dead(void) { } +#endif + +static void __init alloc_runtime_data(int cpu) +{ + struct sev_es_runtime_data *data; + + data = memblock_alloc(sizeof(*data), PAGE_SIZE); + if (!data) + panic("Can't allocate SEV-ES runtime data"); + + per_cpu(runtime_data, cpu) = data; +} + +static void __init init_ghcb(int cpu) +{ + struct sev_es_runtime_data *data; + int err; + + data = per_cpu(runtime_data, cpu); + + err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, + sizeof(data->ghcb_page)); + if (err) + panic("Can't map GHCBs unencrypted"); + + memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); + + data->ghcb_active = false; + data->backup_ghcb_active = false; +} + +void __init sev_es_init_vc_handling(void) +{ + int cpu; + + BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); + + if (!sev_es_active()) + return; + + if (!sev_es_check_cpu_features()) + panic("SEV-ES CPU Features missing"); + + /* Enable SEV-ES special handling */ + static_branch_enable(&sev_es_enable_key); + + /* Initialize per-cpu GHCB pages */ + for_each_possible_cpu(cpu) { + alloc_runtime_data(cpu); + init_ghcb(cpu); + setup_vc_stacks(cpu); + } + + sev_es_setup_play_dead(); + + /* Secondary CPUs use the runtime #VC handler */ + initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; +} + +static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) +{ + int trapnr = ctxt->fi.vector; + + if (trapnr == X86_TRAP_PF) + native_write_cr2(ctxt->fi.cr2); + + ctxt->regs->orig_ax = ctxt->fi.error_code; + do_early_exception(ctxt->regs, trapnr); +} + +static long *vc_insn_get_reg(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} + +static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) +{ + long *reg_array; + int offset; + + reg_array = (long *)ctxt->regs; + offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); + + if (offset < 0) + return NULL; + + offset /= sizeof(long); + + return reg_array + offset; +} +static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + unsigned int bytes, bool read) +{ + u64 exit_code, exit_info_1, exit_info_2; + unsigned long ghcb_pa = __pa(ghcb); + enum es_result res; + phys_addr_t paddr; + void __user *ref; + + ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); + if (ref == (void __user *)-1L) + return ES_UNSUPPORTED; + + exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; + + res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); + if (res != ES_OK) { + if (res == ES_EXCEPTION && !read) + ctxt->fi.error_code |= X86_PF_WRITE; + + return res; + } + + exit_info_1 = paddr; + /* Can never be greater than 8 */ + exit_info_2 = bytes; + + ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); + + return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); +} + +static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + int sign_byte; + long *reg_data; + + switch (insn->opcode.bytes[1]) { + /* MMIO Read w/ zero-extension */ + case 0xb6: + bytes = 1; + fallthrough; + case 0xb7: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Zero extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memset(reg_data, 0, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MMIO Read w/ sign-extension */ + case 0xbe: + bytes = 1; + fallthrough; + case 0xbf: + if (!bytes) + bytes = 2; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + /* Sign extend based on operand size */ + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + if (bytes == 1) { + u8 *val = (u8 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x80) ? 0xff : 0x00; + } else { + u16 *val = (u16 *)ghcb->shared_buffer; + + sign_byte = (*val & 0x8000) ? 0xff : 0x00; + } + memset(reg_data, sign_byte, insn->opnd_bytes); + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +/* + * The MOVS instruction has two memory operands, which raises the + * problem that it is not known whether the access to the source or the + * destination caused the #VC exception (and hence whether an MMIO read + * or write operation needs to be emulated). + * + * Instead of playing games with walking page-tables and trying to guess + * whether the source or destination is an MMIO range, split the move + * into two operations, a read and a write with only one memory operand. + * This will cause a nested #VC exception on the MMIO address which can + * then be handled. + * + * This implementation has the benefit that it also supports MOVS where + * source _and_ destination are MMIO regions. + * + * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a + * rare operation. If it turns out to be a performance problem the split + * operations can be moved to memcpy_fromio() and memcpy_toio(). + */ +static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, + unsigned int bytes) +{ + unsigned long ds_base, es_base; + unsigned char *src, *dst; + unsigned char buffer[8]; + enum es_result ret; + bool rep; + int off; + + ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); + es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); + + if (ds_base == -1L || es_base == -1L) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + src = ds_base + (unsigned char *)ctxt->regs->si; + dst = es_base + (unsigned char *)ctxt->regs->di; + + ret = vc_read_mem(ctxt, src, buffer, bytes); + if (ret != ES_OK) + return ret; + + ret = vc_write_mem(ctxt, dst, buffer, bytes); + if (ret != ES_OK) + return ret; + + if (ctxt->regs->flags & X86_EFLAGS_DF) + off = -bytes; + else + off = bytes; + + ctxt->regs->si += off; + ctxt->regs->di += off; + + rep = insn_has_rep_prefix(&ctxt->insn); + if (rep) + ctxt->regs->cx -= 1; + + if (!rep || ctxt->regs->cx == 0) + return ES_OK; + else + return ES_RETRY; +} + +static enum es_result vc_handle_mmio(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct insn *insn = &ctxt->insn; + unsigned int bytes = 0; + enum es_result ret; + long *reg_data; + + switch (insn->opcode.bytes[0]) { + /* MMIO Write */ + case 0x88: + bytes = 1; + fallthrough; + case 0x89: + if (!bytes) + bytes = insn->opnd_bytes; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + memcpy(ghcb->shared_buffer, reg_data, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + case 0xc6: + bytes = 1; + fallthrough; + case 0xc7: + if (!bytes) + bytes = insn->opnd_bytes; + + memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); + + ret = vc_do_mmio(ghcb, ctxt, bytes, false); + break; + + /* MMIO Read */ + case 0x8a: + bytes = 1; + fallthrough; + case 0x8b: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_do_mmio(ghcb, ctxt, bytes, true); + if (ret) + break; + + reg_data = vc_insn_get_reg(ctxt); + if (!reg_data) + return ES_DECODE_FAILED; + + /* Zero-extend for 32-bit operation */ + if (bytes == 4) + *reg_data = 0; + + memcpy(reg_data, ghcb->shared_buffer, bytes); + break; + + /* MOVS instruction */ + case 0xa4: + bytes = 1; + fallthrough; + case 0xa5: + if (!bytes) + bytes = insn->opnd_bytes; + + ret = vc_handle_mmio_movs(ctxt, bytes); + break; + /* Two-Byte Opcodes */ + case 0x0f: + ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt); + break; + default: + ret = ES_UNSUPPORTED; + } + + return ret; +} + +static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long val, *reg = vc_insn_get_rm(ctxt); + enum es_result ret; + + if (!reg) + return ES_DECODE_FAILED; + + val = *reg; + + /* Upper 32 bits must be written as zeroes */ + if (val >> 32) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* Clear out other reserved bits and set bit 10 */ + val = (val & 0xffff23ffL) | BIT(10); + + /* Early non-zero writes to DR7 are not supported */ + if (!data && (val & ~DR7_RESET_VALUE)) + return ES_UNSUPPORTED; + + /* Using a value of 0 for ExitInfo1 means RAX holds the value */ + ghcb_set_rax(ghcb, val); + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); + if (ret != ES_OK) + return ret; + + if (data) + data->dr7 = val; + + return ES_OK; +} + +static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + long *reg = vc_insn_get_rm(ctxt); + + if (!reg) + return ES_DECODE_FAILED; + + if (data) + *reg = data->dr7; + else + *reg = DR7_RESET_VALUE; + + return ES_OK; +} + +static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); +} + +static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rcx(ghcb, ctxt->regs->cx); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); + if (ret != ES_OK) + return ret; + + if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + ctxt->regs->dx = ghcb->save.rdx; + + return ES_OK; +} + +static enum es_result vc_handle_monitor(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Treat it as a NOP and do not leak a physical address to the + * hypervisor. + */ + return ES_OK; +} + +static enum es_result vc_handle_mwait(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* Treat the same as MONITOR/MONITORX */ + return ES_OK; +} + +static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + enum es_result ret; + + ghcb_set_rax(ghcb, ctxt->regs->ax); + ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); + + if (x86_platform.hyper.sev_es_hcall_prepare) + x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); + + ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); + if (ret != ES_OK) + return ret; + + if (!ghcb_rax_is_valid(ghcb)) + return ES_VMM_ERROR; + + ctxt->regs->ax = ghcb->save.rax; + + /* + * Call sev_es_hcall_finish() after regs->ax is already set. + * This allows the hypervisor handler to overwrite it again if + * necessary. + */ + if (x86_platform.hyper.sev_es_hcall_finish && + !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) + return ES_VMM_ERROR; + + return ES_OK; +} + +static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, + struct es_em_ctxt *ctxt) +{ + /* + * Calling ecx_alignment_check() directly does not work, because it + * enables IRQs and the GHCB is active. Forward the exception and call + * it later from vc_forward_exception(). + */ + ctxt->fi.vector = X86_TRAP_AC; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; +} + +static __always_inline void vc_handle_trap_db(struct pt_regs *regs) +{ + if (user_mode(regs)) + noist_exc_debug(regs); + else + exc_debug(regs); +} + +static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, + struct ghcb *ghcb, + unsigned long exit_code) +{ + enum es_result result; + + switch (exit_code) { + case SVM_EXIT_READ_DR7: + result = vc_handle_dr7_read(ghcb, ctxt); + break; + case SVM_EXIT_WRITE_DR7: + result = vc_handle_dr7_write(ghcb, ctxt); + break; + case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: + result = vc_handle_trap_ac(ghcb, ctxt); + break; + case SVM_EXIT_RDTSC: + case SVM_EXIT_RDTSCP: + result = vc_handle_rdtsc(ghcb, ctxt, exit_code); + break; + case SVM_EXIT_RDPMC: + result = vc_handle_rdpmc(ghcb, ctxt); + break; + case SVM_EXIT_INVD: + pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); + result = ES_UNSUPPORTED; + break; + case SVM_EXIT_CPUID: + result = vc_handle_cpuid(ghcb, ctxt); + break; + case SVM_EXIT_IOIO: + result = vc_handle_ioio(ghcb, ctxt); + break; + case SVM_EXIT_MSR: + result = vc_handle_msr(ghcb, ctxt); + break; + case SVM_EXIT_VMMCALL: + result = vc_handle_vmmcall(ghcb, ctxt); + break; + case SVM_EXIT_WBINVD: + result = vc_handle_wbinvd(ghcb, ctxt); + break; + case SVM_EXIT_MONITOR: + result = vc_handle_monitor(ghcb, ctxt); + break; + case SVM_EXIT_MWAIT: + result = vc_handle_mwait(ghcb, ctxt); + break; + case SVM_EXIT_NPF: + result = vc_handle_mmio(ghcb, ctxt); + break; + default: + /* + * Unexpected #VC exception + */ + result = ES_UNSUPPORTED; + } + + return result; +} + +static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) +{ + long error_code = ctxt->fi.error_code; + int trapnr = ctxt->fi.vector; + + ctxt->regs->orig_ax = ctxt->fi.error_code; + + switch (trapnr) { + case X86_TRAP_GP: + exc_general_protection(ctxt->regs, error_code); + break; + case X86_TRAP_UD: + exc_invalid_op(ctxt->regs); + break; + case X86_TRAP_AC: + exc_alignment_check(ctxt->regs, error_code); + break; + default: + pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); + BUG(); + } +} + +static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) +{ + unsigned long sp = (unsigned long)regs; + + return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); +} + +/* + * Main #VC exception handler. It is called when the entry code was able to + * switch off the IST to a safe kernel stack. + * + * With the current implementation it is always possible to switch to a safe + * stack because #VC exceptions only happen at known places, like intercepted + * instructions or accesses to MMIO areas/IO ports. They can also happen with + * code instrumentation when the hypervisor intercepts #DB, but the critical + * paths are forbidden to be instrumented, so #DB exceptions currently also + * only happen in safe places. + */ +DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) +{ + struct sev_es_runtime_data *data = this_cpu_read(runtime_data); + irqentry_state_t irq_state; + struct ghcb_state state; + struct es_em_ctxt ctxt; + enum es_result result; + struct ghcb *ghcb; + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { + vc_handle_trap_db(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + lockdep_assert_irqs_disabled(); + instrumentation_begin(); + + /* + * This is invoked through an interrupt gate, so IRQs are disabled. The + * code below might walk page-tables for user or kernel addresses, so + * keep the IRQs disabled to protect us against concurrent TLB flushes. + */ + + ghcb = sev_es_get_ghcb(&state); + if (!ghcb) { + /* + * Mark GHCBs inactive so that panic() is able to print the + * message. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + } + + vc_ghcb_invalidate(ghcb); + result = vc_init_em_ctxt(&ctxt, regs, error_code); + + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, ghcb, error_code); + + sev_es_put_ghcb(&state); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + error_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + pr_emerg("Unknown result in %s():%d\n", __func__, result); + /* + * Emulating the instruction which caused the #VC exception + * failed - can't continue so print debug information + */ + BUG(); + } + +out: + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); + + return; + +fail: + if (user_mode(regs)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } else { + pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", + result); + + /* Show some debug info */ + show_regs(regs); + + /* Ask hypervisor to sev_es_terminate */ + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + /* If that fails and we get here - just panic */ + panic("Returned from Terminate-Request to Hypervisor\n"); + } + + goto out; +} + +/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */ +DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) +{ + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); +} + +DEFINE_IDTENTRY_VC(exc_vmm_communication) +{ + if (likely(!on_vc_fallback_stack(regs))) + safe_stack_exc_vmm_communication(regs, error_code); + else + ist_exc_vmm_communication(regs, error_code); +} + +bool __init handle_vc_boot_ghcb(struct pt_regs *regs) +{ + unsigned long exit_code = regs->orig_ax; + struct es_em_ctxt ctxt; + enum es_result result; + + /* Do initial setup or terminate the guest */ + if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) + sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST); + + vc_ghcb_invalidate(boot_ghcb); + + result = vc_init_em_ctxt(&ctxt, regs, exit_code); + if (result == ES_OK) + result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); + + /* Done - now check the result */ + switch (result) { + case ES_OK: + vc_finish_insn(&ctxt); + break; + case ES_UNSUPPORTED: + early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_VMM_ERROR: + early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_DECODE_FAILED: + early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", + exit_code, regs->ip); + goto fail; + case ES_EXCEPTION: + vc_early_forward_exception(&ctxt); + break; + case ES_RETRY: + /* Nothing to do */ + break; + default: + BUG(); + } + + return true; + +fail: + show_regs(regs); + + while (true) + halt(); +} diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index b93d6cd08a7f..121921b2927c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index df7b5477fc4f..7515e78ef898 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include /* * We allocate runtime services regions top-down, starting from -4G, i.e. diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 1be71ef5e4c4..2e1c1bec0f9e 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; -- cgit v1.2.3 From b81fc74d53d1248de6db3136dd6b29e5d5528021 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:35 -0500 Subject: x86/sev: Move GHCB MSR protocol and NAE definitions in a common header The guest and the hypervisor contain separate macros to get and set the GHCB MSR protocol and NAE event fields. Consolidate the GHCB protocol definitions and helper macros in one place. Leave the supported protocol version define in separate files to keep the guest and hypervisor flexibility to support different GHCB version in the same release. There is no functional change intended. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-3-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 62 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/sev.h | 30 +++---------------- arch/x86/kernel/sev-shared.c | 20 ++++++------- arch/x86/kvm/svm/svm.h | 38 +++--------------------- 4 files changed, 80 insertions(+), 70 deletions(-) create mode 100644 arch/x86/include/asm/sev-common.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h new file mode 100644 index 000000000000..629c3df243f0 --- /dev/null +++ b/arch/x86/include/asm/sev-common.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD SEV header common between the guest and the hypervisor. + * + * Author: Brijesh Singh + */ + +#ifndef __ASM_X86_SEV_COMMON_H +#define __ASM_X86_SEV_COMMON_H + +#define GHCB_MSR_INFO_POS 0 +#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) + +#define GHCB_MSR_SEV_INFO_RESP 0x001 +#define GHCB_MSR_SEV_INFO_REQ 0x002 +#define GHCB_MSR_VER_MAX_POS 48 +#define GHCB_MSR_VER_MAX_MASK 0xffff +#define GHCB_MSR_VER_MIN_POS 32 +#define GHCB_MSR_VER_MIN_MASK 0xffff +#define GHCB_MSR_CBIT_POS 24 +#define GHCB_MSR_CBIT_MASK 0xff +#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ + ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ + (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ + (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ + GHCB_MSR_SEV_INFO_RESP) +#define GHCB_MSR_INFO(v) ((v) & 0xfffUL) +#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK) +#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK) + +#define GHCB_MSR_CPUID_REQ 0x004 +#define GHCB_MSR_CPUID_RESP 0x005 +#define GHCB_MSR_CPUID_FUNC_POS 32 +#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff +#define GHCB_MSR_CPUID_VALUE_POS 32 +#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff +#define GHCB_MSR_CPUID_REG_POS 30 +#define GHCB_MSR_CPUID_REG_MASK 0x3 +#define GHCB_CPUID_REQ_EAX 0 +#define GHCB_CPUID_REQ_EBX 1 +#define GHCB_CPUID_REQ_ECX 2 +#define GHCB_CPUID_REQ_EDX 3 +#define GHCB_CPUID_REQ(fn, reg) \ + (GHCB_MSR_CPUID_REQ | \ + (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \ + (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS)) + +#define GHCB_MSR_TERM_REQ 0x100 +#define GHCB_MSR_TERM_REASON_SET_POS 12 +#define GHCB_MSR_TERM_REASON_SET_MASK 0xf +#define GHCB_MSR_TERM_REASON_POS 16 +#define GHCB_MSR_TERM_REASON_MASK 0xff +#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \ + (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \ + ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS)) + +#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 +#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 + +#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) + +#endif diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index cf1d957c7091..fa5cd05d3b5b 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -10,34 +10,12 @@ #include #include +#include -#define GHCB_SEV_INFO 0x001UL -#define GHCB_SEV_INFO_REQ 0x002UL -#define GHCB_INFO(v) ((v) & 0xfffUL) -#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL) -#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL) -#define GHCB_PROTO_OUR 0x0001UL -#define GHCB_SEV_CPUID_REQ 0x004UL -#define GHCB_CPUID_REQ_EAX 0 -#define GHCB_CPUID_REQ_EBX 1 -#define GHCB_CPUID_REQ_ECX 2 -#define GHCB_CPUID_REQ_EDX 3 -#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \ - (((unsigned long)reg & 3) << 30) | \ - (((unsigned long)fn) << 32)) +#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_PROTOCOL_MAX 1ULL +#define GHCB_DEFAULT_USAGE 0ULL -#define GHCB_PROTOCOL_MAX 0x0001UL -#define GHCB_DEFAULT_USAGE 0x0000UL - -#define GHCB_SEV_CPUID_RESP 0x005UL -#define GHCB_SEV_TERMINATE 0x100UL -#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \ - (((((u64)reason_set) & 0x7) << 12) | \ - ((((u64)reason_val) & 0xff) << 16)) -#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0 -#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1 - -#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff) #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } enum es_result { diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 0aa9f13efd57..6ec8b3bfd76e 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -26,13 +26,13 @@ static bool __init sev_es_check_cpu_features(void) static void __noreturn sev_es_terminate(unsigned int reason) { - u64 val = GHCB_SEV_TERMINATE; + u64 val = GHCB_MSR_TERM_REQ; /* * Tell the hypervisor what went wrong - only reason-set 0 is * currently supported. */ - val |= GHCB_SEV_TERMINATE_REASON(0, reason); + val |= GHCB_SEV_TERM_REASON(0, reason); /* Request Guest Termination from Hypvervisor */ sev_es_wr_ghcb_msr(val); @@ -47,15 +47,15 @@ static bool sev_es_negotiate_protocol(void) u64 val; /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ); + sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_INFO(val) != GHCB_SEV_INFO) + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) return false; - if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR) + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR) return false; return true; @@ -153,28 +153,28 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->ax = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->bx = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->cx = val >> 32; sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX)); VMGEXIT(); val = sev_es_rd_ghcb_msr(); - if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP) + if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) goto fail; regs->dx = val >> 32; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 84b3133c2251..42f8a7b9048f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -20,6 +20,7 @@ #include #include +#include #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) @@ -525,40 +526,9 @@ void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); /* sev.c */ -#define GHCB_VERSION_MAX 1ULL -#define GHCB_VERSION_MIN 1ULL - -#define GHCB_MSR_INFO_POS 0 -#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) - -#define GHCB_MSR_SEV_INFO_RESP 0x001 -#define GHCB_MSR_SEV_INFO_REQ 0x002 -#define GHCB_MSR_VER_MAX_POS 48 -#define GHCB_MSR_VER_MAX_MASK 0xffff -#define GHCB_MSR_VER_MIN_POS 32 -#define GHCB_MSR_VER_MIN_MASK 0xffff -#define GHCB_MSR_CBIT_POS 24 -#define GHCB_MSR_CBIT_MASK 0xff -#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \ - ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \ - (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \ - (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \ - GHCB_MSR_SEV_INFO_RESP) - -#define GHCB_MSR_CPUID_REQ 0x004 -#define GHCB_MSR_CPUID_RESP 0x005 -#define GHCB_MSR_CPUID_FUNC_POS 32 -#define GHCB_MSR_CPUID_FUNC_MASK 0xffffffff -#define GHCB_MSR_CPUID_VALUE_POS 32 -#define GHCB_MSR_CPUID_VALUE_MASK 0xffffffff -#define GHCB_MSR_CPUID_REG_POS 30 -#define GHCB_MSR_CPUID_REG_MASK 0x3 - -#define GHCB_MSR_TERM_REQ 0x100 -#define GHCB_MSR_TERM_REASON_SET_POS 12 -#define GHCB_MSR_TERM_REASON_SET_MASK 0xf -#define GHCB_MSR_TERM_REASON_POS 16 -#define GHCB_MSR_TERM_REASON_MASK 0xff +#define GHCB_VERSION_MAX 1ULL +#define GHCB_VERSION_MIN 1ULL + extern unsigned int max_sev_asid; -- cgit v1.2.3 From 059e5c321a65657877924256ea8ad9c0df257b45 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Tue, 27 Apr 2021 06:16:36 -0500 Subject: x86/msr: Rename MSR_K8_SYSCFG to MSR_AMD64_SYSCFG The SYSCFG MSR continued being updated beyond the K8 family; drop the K8 name from it. Suggested-by: Borislav Petkov Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lkml.kernel.org/r/20210427111636.1207-4-brijesh.singh@amd.com --- Documentation/virt/kvm/amd-memory-encryption.rst | 2 +- Documentation/x86/amd-memory-encryption.rst | 6 +++--- arch/x86/include/asm/msr-index.h | 6 +++--- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/mmconf-fam10h_64.c | 2 +- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/x86.c | 2 +- arch/x86/mm/mem_encrypt_identity.c | 6 +++--- arch/x86/pci/amd_bus.c | 2 +- arch/x86/realmode/rm/trampoline_64.S | 4 ++-- drivers/edac/amd64_edac.c | 2 +- tools/arch/x86/include/asm/msr-index.h | 6 +++--- 14 files changed, 26 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst index 5ec8a1902e15..5c081c8c7164 100644 --- a/Documentation/virt/kvm/amd-memory-encryption.rst +++ b/Documentation/virt/kvm/amd-memory-encryption.rst @@ -22,7 +22,7 @@ to SEV:: [ecx]: Bits[31:0] Number of encrypted guests supported simultaneously -If support for SEV is present, MSR 0xc001_0010 (MSR_K8_SYSCFG) and MSR 0xc001_0015 +If support for SEV is present, MSR 0xc001_0010 (MSR_AMD64_SYSCFG) and MSR 0xc001_0015 (MSR_K7_HWCR) can be used to determine if it can be enabled:: 0xc001_0010: diff --git a/Documentation/x86/amd-memory-encryption.rst b/Documentation/x86/amd-memory-encryption.rst index c48d452d0718..a1940ebe7be5 100644 --- a/Documentation/x86/amd-memory-encryption.rst +++ b/Documentation/x86/amd-memory-encryption.rst @@ -53,7 +53,7 @@ CPUID function 0x8000001f reports information related to SME:: system physical addresses, not guest physical addresses) -If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to +If support for SME is present, MSR 0xc00100010 (MSR_AMD64_SYSCFG) can be used to determine if SME is enabled and/or to enable memory encryption:: 0xc0010010: @@ -79,7 +79,7 @@ The state of SME in the Linux kernel can be documented as follows: The CPU supports SME (determined through CPUID instruction). - Enabled: - Supported and bit 23 of MSR_K8_SYSCFG is set. + Supported and bit 23 of MSR_AMD64_SYSCFG is set. - Active: Supported, Enabled and the Linux kernel is actively applying @@ -89,7 +89,7 @@ The state of SME in the Linux kernel can be documented as follows: SME can also be enabled and activated in the BIOS. If SME is enabled and activated in the BIOS, then all memory accesses will be encrypted and it will not be necessary to activate the Linux memory encryption support. If the BIOS -merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate +merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG), then Linux can activate memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or by supplying mem_encrypt=on on the kernel command line. However, if BIOS does not enable SME, then Linux will not be able to activate memory encryption, even diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 742d89a00721..211ba3375ee9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -537,9 +537,9 @@ /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d -#define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 -#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG 0xc0010010 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d11384dc9ab..0adb0341cd7c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -593,8 +593,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; /* diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 0c3b372318b7..b5f43049fa5f 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -836,7 +836,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ - if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0) return 0; /* * Memory between 4GB and top of mem is forced WB by this magic bit. diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index b90f3f437765..558108296f3c 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -53,13 +53,13 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - rdmsr(MSR_K8_SYSCFG, lo, hi); + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; - mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi); } } diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index b5cb49e57df8..c94dec6a1834 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -95,7 +95,7 @@ static void get_fam10h_pci_mmconf_base(void) return; /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is not enabled? */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b649f92287a2..433e8e4fb3a6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -858,8 +858,8 @@ static __init void svm_adjust_mmio_mask(void) return; /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; enc_bit = cpuid_ebx(0x8000001f) & 0x3f; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6eda2834fc05..853c40e89335 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3402,7 +3402,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: - case MSR_K8_SYSCFG: + case MSR_AMD64_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: case MSR_VM_HSAVE_PA: diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 04aba7e80a36..a9639f663d25 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -529,7 +529,7 @@ void __init sme_enable(struct boot_params *bp) /* * No SME if Hypervisor bit is set. This check is here to * prevent a guest from trying to enable SME. For running as a - * KVM guest the MSR_K8_SYSCFG will be sufficient, but there + * KVM guest the MSR_AMD64_SYSCFG will be sufficient, but there * might be other hypervisors which emulate that MSR as non-zero * or even pass it through to the guest. * A malicious hypervisor can still trick a guest into this @@ -542,8 +542,8 @@ void __init sme_enable(struct boot_params *bp) return; /* For SME, check the SYSCFG MSR */ - msr = __rdmsr(MSR_K8_SYSCFG); - if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + msr = __rdmsr(MSR_AMD64_SYSCFG); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) return; } else { /* SEV state cannot be controlled by a command line option */ diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index ae744b6a0785..dd40d3fea74e 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -284,7 +284,7 @@ static int __init early_root_info_init(void) /* need to take out [4G, TOM2) for RAM*/ /* SYS_CFG */ - address = MSR_K8_SYSCFG; + address = MSR_AMD64_SYSCFG; rdmsrl(address, val); /* TOP_MEM2 is enabled? */ if (val & (1<<21)) { diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 84c5d1b33d10..cc8391f86cdb 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -123,9 +123,9 @@ SYM_CODE_START(startup_32) */ btl $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags jnc .Ldone - movl $MSR_K8_SYSCFG, %ecx + movl $MSR_AMD64_SYSCFG, %ecx rdmsr - bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax + bts $MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT, %eax jc .Ldone /* diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9fa4dfc6ebee..f0d8f60acee1 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3083,7 +3083,7 @@ static void read_mc_regs(struct amd64_pvt *pvt) edac_dbg(0, " TOP_MEM: 0x%016llx\n", pvt->top_mem); /* Check first whether TOP_MEM2 is enabled: */ - rdmsrl(MSR_K8_SYSCFG, msr_val); + rdmsrl(MSR_AMD64_SYSCFG, msr_val); if (msr_val & BIT(21)) { rdmsrl(MSR_K8_TOP_MEM2, pvt->top_mem2); edac_dbg(0, " TOP_MEM2: 0x%016llx\n", pvt->top_mem2); diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 45029354e0a8..c60b09e7602f 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -533,9 +533,9 @@ /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d -#define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 -#define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) +#define MSR_AMD64_SYSCFG 0xc0010010 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23 +#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT) #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 -- cgit v1.2.3 From f279b49f13bd2151bbe402a1d812c1e3646c4bbb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 01:28:40 -0700 Subject: x86/boot: Modernize genimage script; hdimage+EFI support The image generation scripts in arch/x86/boot are pretty out of date, except for the isoimage target. Update and clean up the genimage.sh script, and make it support an arbitrary number of initramfs files in the image. Add a "hdimage" target, which can be booted by either BIOS or EFI (if the kernel is compiled with the EFI stub.) For EFI to be able to pass the command line to the kernel, we need the EFI shell, but the firmware builtin EFI shell, if it even exists, is pretty much always the last resort boot option, so search for OVMF or EDK2 and explicitly include a copy of the EFI shell. To make this all work, use bash features in the script. Furthermore, this version of the script makes use of some mtools features, especially mpartition, that might not exist in very old version of mtools, but given all the other dependencies on this script this doesn't seem such a big deal. Finally, put a volume label ("LINUX_BOOT") on all generated images. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510082840.628372-1-hpa@zytor.com --- arch/x86/Makefile | 5 +- arch/x86/boot/.gitignore | 1 + arch/x86/boot/Makefile | 44 ++++--- arch/x86/boot/genimage.sh | 303 +++++++++++++++++++++++++++++++------------ arch/x86/boot/mtools.conf.in | 3 + 5 files changed, 252 insertions(+), 104 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c77c5d8a7b3e..d42764c60a0a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -256,7 +256,7 @@ drivers-$(CONFIG_FB) += arch/x86/video/ boot := arch/x86/boot -BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 isoimage +BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 hdimage isoimage PHONY += bzImage $(BOOT_TARGETS) @@ -314,8 +314,9 @@ define archhelp echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)' + echo ' hdimage - Create a BIOS/EFI hard disk image (arch/x86/boot/hdimage)' echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' - echo ' bzdisk/fdimage*/isoimage also accept:' + echo ' bzdisk/fdimage*/hdimage/isoimage also accept:' echo ' FDARGS="..." arguments for the booted kernel' echo ' FDINITRD=file initrd for the booted kernel' echo '' diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index 9cc7f1357b9b..1189be057ebd 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -11,3 +11,4 @@ setup.elf fdimage mtools.conf image.iso +hdimage diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index fe605205b4ce..dfbc26a8e924 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -29,7 +29,7 @@ KCOV_INSTRUMENT := n SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage -targets += fdimage fdimage144 fdimage288 image.iso mtools.conf +targets += fdimage fdimage144 fdimage288 image.iso hdimage subdir- := compressed setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o @@ -115,47 +115,49 @@ $(obj)/compressed/vmlinux: FORCE $(Q)$(MAKE) $(build)=$(obj)/compressed $@ # Set this if you want to pass append arguments to the -# bzdisk/fdimage/isoimage kernel +# bzdisk/fdimage/hdimage/isoimage kernel FDARGS = -# Set this if you want an initrd included with the -# bzdisk/fdimage/isoimage kernel +# Set this if you want one or more initrds included in the image FDINITRD = -image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,) +imgdeps = $(obj)/bzImage $(obj)/mtools.conf $(src)/genimage.sh $(obj)/mtools.conf: $(src)/mtools.conf.in sed -e 's|@OBJ@|$(obj)|g' < $< > $@ +targets += mtools.conf + +# genimage.sh requires bash, but it also has a bunch of other +# external dependencies. quiet_cmd_genimage = GENIMAGE $3 -cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ - $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD) +cmd_genimage = $(BASH) $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \ + $(obj)/mtools.conf '$(FDARGS)' $(FDINITRD) -PHONY += bzdisk fdimage fdimage144 fdimage288 isoimage bzlilo install +PHONY += bzdisk fdimage fdimage144 fdimage288 hdimage isoimage install # This requires write access to /dev/fd0 -bzdisk: $(obj)/bzImage $(obj)/mtools.conf +# All images require syslinux to be installed; hdimage also requires +# EDK2/OVMF if the kernel is compiled with the EFI stub. +bzdisk: $(imgdeps) $(call cmd,genimage,bzdisk,/dev/fd0) -# These require being root or having syslinux 2.02 or higher installed -fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf +fdimage fdimage144: $(imgdeps) $(call cmd,genimage,fdimage144,$(obj)/fdimage) @$(kecho) 'Kernel: $(obj)/fdimage is ready' -fdimage288: $(obj)/bzImage $(obj)/mtools.conf +fdimage288: $(imgdeps) $(call cmd,genimage,fdimage288,$(obj)/fdimage) @$(kecho) 'Kernel: $(obj)/fdimage is ready' -isoimage: $(obj)/bzImage +hdimage: $(imgdeps) + $(call cmd,genimage,hdimage,$(obj)/hdimage) + @$(kecho) 'Kernel: $(obj)/hdimage is ready' + +isoimage: $(imgdeps) $(call cmd,genimage,isoimage,$(obj)/image.iso) @$(kecho) 'Kernel: $(obj)/image.iso is ready' -bzlilo: - if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi - if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi - cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz - cp System.map $(INSTALL_PATH)/ - if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi - install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + $(CONFIG_SHELL) $(srctree)/$(src)/install.sh \ + $(KERNELRELEASE) $(obj)/bzImage \ System.map "$(INSTALL_PATH)" diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index 6a10d52a4145..0673fdfc1a11 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # This file is subject to the terms and conditions of the GNU General Public # License. See the file "COPYING" in the main directory of this archive @@ -8,15 +8,24 @@ # # Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others # -# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture +# "make fdimage/fdimage144/fdimage288/hdimage/isoimage" +# script for x86 architecture # # Arguments: -# $1 - fdimage format -# $2 - target image file -# $3 - kernel bzImage file -# $4 - mtool configuration file -# $5 - kernel cmdline -# $6 - inird image file +# $1 - fdimage format +# $2 - target image file +# $3 - kernel bzImage file +# $4 - mtools configuration file +# $5 - kernel cmdline +# $6+ - initrd image file(s) +# +# This script requires: +# bash +# syslinux +# mtools (for fdimage* and hdimage) +# edk2/OVMF (for hdimage) +# +# Otherwise try to stick to POSIX shell commands... # # Use "make V=1" to debug this script @@ -26,105 +35,237 @@ case "${KBUILD_VERBOSE}" in ;; esac -verify () { - if [ ! -f "$1" ]; then - echo "" 1>&2 - echo " *** Missing file: $1" 1>&2 - echo "" 1>&2 - exit 1 +# Exit the top-level shell with an error +topshell=$$ +trap 'exit 1' USR1 +die() { + echo "" 1>&2 + echo " *** $*" 1>&2 + echo "" 1>&2 + kill -USR1 $topshell +} + +# Verify the existence and readability of a file +verify() { + if [ ! -f "$1" -o ! -r "$1" ]; then + die "Missing file: $1" fi } +diskfmt="$1" +FIMAGE="$2" +FBZIMAGE="$3" +MTOOLSRC="$4" +KCMDLINE="$5" +shift 5 # Remaining arguments = initrd files + +export MTOOLSRC -export MTOOLSRC=$4 -FIMAGE=$2 -FBZIMAGE=$3 -KCMDLINE=$5 -FDINITRD=$6 +# common options for dd +dd='dd iflag=fullblock' # Make sure the files actually exist verify "$FBZIMAGE" -genbzdisk() { - verify "$MTOOLSRC" - mformat a: - syslinux $FIMAGE - echo "$KCMDLINE" | mcopy - a:syslinux.cfg - if [ -f "$FDINITRD" ] ; then - mcopy "$FDINITRD" a:initrd.img +declare -a FDINITRDS +irdpfx=' initrd=' +initrdopts_syslinux='' +initrdopts_efi='' +for f in "$@"; do + if [ -f "$f" -a -r "$f" ]; then + FDINITRDS=("${FDINITRDS[@]}" "$f") + fname="$(basename "$f")" + initrdopts_syslinux="${initrdopts_syslinux}${irdpfx}${fname}" + irdpfx=, + initrdopts_efi="${initrdopts_efi} initrd=${fname}" fi - mcopy $FBZIMAGE a:linux +done + +# Read a $3-byte littleendian unsigned value at offset $2 from file $1 +le() { + local n=0 + local m=1 + for b in $(od -A n -v -j $2 -N $3 -t u1 "$1"); do + n=$((n + b*m)) + m=$((m * 256)) + done + echo $n } -genfdimage144() { - verify "$MTOOLSRC" - dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null - mformat v: - syslinux $FIMAGE - echo "$KCMDLINE" | mcopy - v:syslinux.cfg - if [ -f "$FDINITRD" ] ; then - mcopy "$FDINITRD" v:initrd.img - fi - mcopy $FBZIMAGE v:linux +# Get the EFI architecture name such that boot{name}.efi is the default +# boot file name. Returns false with no output if the file is not an +# EFI image or otherwise unknown. +efiarch() { + [ -f "$1" ] || return + [ $(le "$1" 0 2) -eq 23117 ] || return # MZ magic + peoffs=$(le "$1" 60 4) # PE header offset + [ $peoffs -ge 64 ] || return + [ $(le "$1" $peoffs 4) -eq 17744 ] || return # PE magic + case $(le "$1" $((peoffs+4+20)) 2) in # PE type + 267) ;; # PE32 + 523) ;; # PE32+ + *) return 1 ;; # Invalid + esac + [ $(le "$1" $((peoffs+4+20+68)) 2) -eq 10 ] || return # EFI app + case $(le "$1" $((peoffs+4)) 2) in # Machine type + 332) echo i386 ;; + 450) echo arm ;; + 512) echo ia64 ;; + 20530) echo riscv32 ;; + 20580) echo riscv64 ;; + 20776) echo riscv128 ;; + 34404) echo x64 ;; + 43620) echo aa64 ;; + esac } -genfdimage288() { - verify "$MTOOLSRC" - dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null - mformat w: - syslinux $FIMAGE - echo "$KCMDLINE" | mcopy - W:syslinux.cfg - if [ -f "$FDINITRD" ] ; then - mcopy "$FDINITRD" w:initrd.img - fi - mcopy $FBZIMAGE w:linux +# Get the combined sizes in bytes of the files given, counting sparse +# files as full length, and padding each file to a 4K block size +filesizes() { + local t=0 + local s + for s in $(ls -lnL "$@" 2>/dev/null | awk '/^-/{ print $5; }'); do + t=$((t + ((s+4095)/4096)*4096)) + done + echo $t } -geniso() { - tmp_dir=`dirname $FIMAGE`/isoimage - rm -rf $tmp_dir - mkdir $tmp_dir - for i in lib lib64 share ; do - for j in syslinux ISOLINUX ; do - if [ -f /usr/$i/$j/isolinux.bin ] ; then - isolinux=/usr/$i/$j/isolinux.bin - fi +# Expand directory names which should be in /usr/share into a list +# of possible alternatives +sharedirs() { + local dir file + for dir in /usr/share /usr/lib64 /usr/lib; do + for file; do + echo "$dir/$file" + echo "$dir/${file^^}" done - for j in syslinux syslinux/modules/bios ; do - if [ -f /usr/$i/$j/ldlinux.c32 ]; then - ldlinux=/usr/$i/$j/ldlinux.c32 - fi + done +} +efidirs() { + local dir file + for dir in /usr/share /boot /usr/lib64 /usr/lib; do + for file; do + echo "$dir/$file" + echo "$dir/${file^^}" done - if [ -n "$isolinux" -a -n "$ldlinux" ] ; then - break + done +} + +findsyslinux() { + local f="$(find -L $(sharedirs syslinux isolinux) \ + -name "$1" -readable -type f -print -quit 2>/dev/null)" + if [ ! -f "$f" ]; then + die "Need a $1 file, please install syslinux/isolinux." + fi + echo "$f" + return 0 +} + +findovmf() { + local arch="$1" + shift + local -a names=(-false) + local name f + for name; do + names=("${names[@]}" -or -iname "$name") + done + for f in $(find -L $(efidirs edk2 ovmf) \ + \( "${names[@]}" \) -readable -type f \ + -print 2>/dev/null); do + if [ "$(efiarch "$f")" = "$arch" ]; then + echo "$f" + return 0 fi done - if [ -z "$isolinux" ] ; then - echo 'Need an isolinux.bin file, please install syslinux/isolinux.' - exit 1 + die "Need a $1 file for $arch, please install EDK2/OVMF." +} + +do_mcopy() { + if [ ${#FDINITRDS[@]} -gt 0 ]; then + mcopy "${FDINITRDS[@]}" "$1" + fi + if [ -n "$efishell" ]; then + mmd "$1"EFI "$1"EFI/Boot + mcopy "$efishell" "$1"EFI/Boot/boot${kefiarch}.efi fi - if [ -z "$ldlinux" ] ; then - echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.' - exit 1 + if [ -n "$kefiarch" ]; then + echo linux "$KCMDLINE$initrdopts_efi" | \ + mcopy - "$1"startup.nsh fi - cp $isolinux $tmp_dir - cp $ldlinux $tmp_dir - cp $FBZIMAGE $tmp_dir/linux - echo "$KCMDLINE" > $tmp_dir/isolinux.cfg - if [ -f "$FDINITRD" ] ; then - cp "$FDINITRD" $tmp_dir/initrd.img + echo default linux "$KCMDLINE$initrdopts_syslinux" | \ + mcopy - "$1"syslinux.cfg + mcopy "$FBZIMAGE" "$1"linux +} + +genbzdisk() { + verify "$MTOOLSRC" + mformat -v 'LINUX_BOOT' a: + syslinux "$FIMAGE" + do_mcopy a: +} + +genfdimage144() { + verify "$MTOOLSRC" + $dd if=/dev/zero of="$FIMAGE" bs=1024 count=1440 2>/dev/null + mformat -v 'LINUX_BOOT' v: + syslinux "$FIMAGE" + do_mcopy v: +} + +genfdimage288() { + verify "$MTOOLSRC" + $dd if=/dev/zero of="$FIMAGE" bs=1024 count=2880 2>/dev/null + mformat -v 'LINUX_BOOT' w: + syslinux "$FIMAGE" + do_mcopy w: +} + +genhdimage() { + verify "$MTOOLSRC" + mbr="$(findsyslinux mbr.bin)" + kefiarch="$(efiarch "$FBZIMAGE")" + if [ -n "$kefiarch" ]; then + # The efishell provides command line handling + efishell="$(findovmf $kefiarch shell.efi shell${kefiarch}.efi)" + ptype='-T 0xef' # EFI system partition, no GPT fi - genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \ - -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \ - -boot-info-table $tmp_dir - isohybrid $FIMAGE 2>/dev/null || true - rm -rf $tmp_dir + sizes=$(filesizes "$FBZIMAGE" "${FDINITRDS[@]}" "$efishell") + # Allow 1% + 1 MiB for filesystem and partition table overhead, + # syslinux, and config files + megs=$(((sizes + sizes/100 + 2*1024*1024 - 1)/(1024*1024))) + $dd if=/dev/zero of="$FIMAGE" bs=$((1024*1024)) count=$megs 2>/dev/null + mpartition -I -c -s 32 -h 64 -t $megs $ptype -b 512 -a h: + $dd if="$mbr" of="$FIMAGE" bs=440 count=1 conv=notrunc 2>/dev/null + mformat -v 'LINUX_BOOT' -s 32 -h 64 -t $megs h: + syslinux --offset $((512*512)) "$FIMAGE" + do_mcopy h: +} + +geniso() { + tmp_dir="$(dirname "$FIMAGE")/isoimage" + rm -rf "$tmp_dir" + mkdir "$tmp_dir" + isolinux=$(findsyslinux isolinux.bin) + ldlinux=$(findsyslinux ldlinux.c32) + cp "$isolinux" "$ldlinux" "$tmp_dir" + cp "$FBZIMAGE" "$tmp_dir"/linux + echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg + cp "${FDINITRDS[@]}" "$tmp_dir"/ + genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \ + -quiet -o "$FIMAGE" -b isolinux.bin \ + -c boot.cat -no-emul-boot -boot-load-size 4 \ + -boot-info-table "$tmp_dir" + isohybrid "$FIMAGE" 2>/dev/null || true + rm -rf "$tmp_dir" } -case $1 in +rm -f "$FIMAGE" + +case "$diskfmt" in bzdisk) genbzdisk;; fdimage144) genfdimage144;; fdimage288) genfdimage288;; + hdimage) genhdimage;; isoimage) geniso;; - *) echo 'Unknown image format'; exit 1; + *) die "Unknown image format: $diskfmt";; esac diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in index efd6d2490c1d..9e2662d01364 100644 --- a/arch/x86/boot/mtools.conf.in +++ b/arch/x86/boot/mtools.conf.in @@ -14,4 +14,7 @@ drive v: drive w: file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter +# Hard disk +drive h: + file="@OBJ@/hdimage" partition=1 mformat_only -- cgit v1.2.3 From be5bb8021c9731f5593de6419ae35d3f16a3e497 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 02:09:38 -0700 Subject: x86/asm: Have the __ASM_FORM macros handle commas in arguments The __ASM_FORM macros are really useful, but in order to be able to use them to define instructions via .byte directives breaks because of the necessary commas. Change the macros to handle commas correctly. [ mingo: Removed stray whitespaces & aligned the definitions vertically. ] Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510090940.924953-2-hpa@zytor.com --- arch/x86/include/asm/asm.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 0603c7423aca..93aad0b63806 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,25 +3,24 @@ #define _ASM_X86_ASM_H #ifdef __ASSEMBLY__ -# define __ASM_FORM(x) x -# define __ASM_FORM_RAW(x) x -# define __ASM_FORM_COMMA(x) x, +# define __ASM_FORM(x, ...) x,## __VA_ARGS__ +# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ +# define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__, #else #include - -# define __ASM_FORM(x) " " __stringify(x) " " -# define __ASM_FORM_RAW(x) __stringify(x) -# define __ASM_FORM_COMMA(x) " " __stringify(x) "," +# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " +# define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__) +# define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) "," #endif #ifndef __x86_64__ /* 32 bit */ -# define __ASM_SEL(a,b) __ASM_FORM(a) -# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) +# define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else /* 64 bit */ -# define __ASM_SEL(a,b) __ASM_FORM(b) -# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) +# define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif #define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ -- cgit v1.2.3 From d88be187a6e6f3a97dfa7ddc500bb9ca191b3772 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 02:09:39 -0700 Subject: x86/asm: Add _ASM_BYTES() macro for a .byte ... opcode sequence Make it easy to create a sequence of bytes that can be used in either assembly proper on in a C asm() statement. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510090940.924953-3-hpa@zytor.com --- arch/x86/include/asm/asm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 93aad0b63806..507a37a46027 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -13,6 +13,8 @@ # define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) "," #endif +#define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;) + #ifndef __x86_64__ /* 32 bit */ # define __ASM_SEL(a,b) __ASM_FORM(a) -- cgit v1.2.3 From eef23e72b78b36924aea8be5ec7c54e628c442ef Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 02:09:40 -0700 Subject: x86/asm: Use _ASM_BYTES() in Use the new generalized _ASM_BYTES() macro from instead of the "home grown" _ASM_MK_NOP() in . Add and update in the tools directory... Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510090940.924953-4-hpa@zytor.com --- arch/x86/include/asm/nops.h | 24 ++--- tools/arch/x86/include/asm/asm.h | 189 ++++++++++++++++++++++++++++++++++++++ tools/arch/x86/include/asm/nops.h | 24 ++--- 3 files changed, 209 insertions(+), 28 deletions(-) create mode 100644 tools/arch/x86/include/asm/asm.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index c1e5e818ba16..c5573eaa5bb9 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_NOPS_H #define _ASM_X86_NOPS_H +#include + /* * Define nops for use with alternative() and for tracing. */ @@ -57,20 +59,14 @@ #endif /* CONFIG_64BIT */ -#ifdef __ASSEMBLY__ -#define _ASM_MK_NOP(x) .byte x -#else -#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" -#endif - -#define ASM_NOP1 _ASM_MK_NOP(BYTES_NOP1) -#define ASM_NOP2 _ASM_MK_NOP(BYTES_NOP2) -#define ASM_NOP3 _ASM_MK_NOP(BYTES_NOP3) -#define ASM_NOP4 _ASM_MK_NOP(BYTES_NOP4) -#define ASM_NOP5 _ASM_MK_NOP(BYTES_NOP5) -#define ASM_NOP6 _ASM_MK_NOP(BYTES_NOP6) -#define ASM_NOP7 _ASM_MK_NOP(BYTES_NOP7) -#define ASM_NOP8 _ASM_MK_NOP(BYTES_NOP8) +#define ASM_NOP1 _ASM_BYTES(BYTES_NOP1) +#define ASM_NOP2 _ASM_BYTES(BYTES_NOP2) +#define ASM_NOP3 _ASM_BYTES(BYTES_NOP3) +#define ASM_NOP4 _ASM_BYTES(BYTES_NOP4) +#define ASM_NOP5 _ASM_BYTES(BYTES_NOP5) +#define ASM_NOP6 _ASM_BYTES(BYTES_NOP6) +#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) +#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) #define ASM_NOP_MAX 8 diff --git a/tools/arch/x86/include/asm/asm.h b/tools/arch/x86/include/asm/asm.h new file mode 100644 index 000000000000..507a37a46027 --- /dev/null +++ b/tools/arch/x86/include/asm/asm.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_ASM_H +#define _ASM_X86_ASM_H + +#ifdef __ASSEMBLY__ +# define __ASM_FORM(x, ...) x,## __VA_ARGS__ +# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ +# define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__, +#else +#include +# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " " +# define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__) +# define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) "," +#endif + +#define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;) + +#ifndef __x86_64__ +/* 32 bit */ +# define __ASM_SEL(a,b) __ASM_FORM(a) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) +#else +/* 64 bit */ +# define __ASM_SEL(a,b) __ASM_FORM(b) +# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) +#endif + +#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \ + inst##q##__VA_ARGS__) +#define __ASM_REG(reg) __ASM_SEL_RAW(e##reg, r##reg) + +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) + +#define _ASM_MOV __ASM_SIZE(mov) +#define _ASM_INC __ASM_SIZE(inc) +#define _ASM_DEC __ASM_SIZE(dec) +#define _ASM_ADD __ASM_SIZE(add) +#define _ASM_SUB __ASM_SIZE(sub) +#define _ASM_XADD __ASM_SIZE(xadd) +#define _ASM_MUL __ASM_SIZE(mul) + +#define _ASM_AX __ASM_REG(ax) +#define _ASM_BX __ASM_REG(bx) +#define _ASM_CX __ASM_REG(cx) +#define _ASM_DX __ASM_REG(dx) +#define _ASM_SP __ASM_REG(sp) +#define _ASM_BP __ASM_REG(bp) +#define _ASM_SI __ASM_REG(si) +#define _ASM_DI __ASM_REG(di) + +#ifndef __x86_64__ +/* 32 bit */ + +#define _ASM_ARG1 _ASM_AX +#define _ASM_ARG2 _ASM_DX +#define _ASM_ARG3 _ASM_CX + +#define _ASM_ARG1L eax +#define _ASM_ARG2L edx +#define _ASM_ARG3L ecx + +#define _ASM_ARG1W ax +#define _ASM_ARG2W dx +#define _ASM_ARG3W cx + +#define _ASM_ARG1B al +#define _ASM_ARG2B dl +#define _ASM_ARG3B cl + +#else +/* 64 bit */ + +#define _ASM_ARG1 _ASM_DI +#define _ASM_ARG2 _ASM_SI +#define _ASM_ARG3 _ASM_DX +#define _ASM_ARG4 _ASM_CX +#define _ASM_ARG5 r8 +#define _ASM_ARG6 r9 + +#define _ASM_ARG1Q rdi +#define _ASM_ARG2Q rsi +#define _ASM_ARG3Q rdx +#define _ASM_ARG4Q rcx +#define _ASM_ARG5Q r8 +#define _ASM_ARG6Q r9 + +#define _ASM_ARG1L edi +#define _ASM_ARG2L esi +#define _ASM_ARG3L edx +#define _ASM_ARG4L ecx +#define _ASM_ARG5L r8d +#define _ASM_ARG6L r9d + +#define _ASM_ARG1W di +#define _ASM_ARG2W si +#define _ASM_ARG3W dx +#define _ASM_ARG4W cx +#define _ASM_ARG5W r8w +#define _ASM_ARG6W r9w + +#define _ASM_ARG1B dil +#define _ASM_ARG2B sil +#define _ASM_ARG3B dl +#define _ASM_ARG4B cl +#define _ASM_ARG5B r8b +#define _ASM_ARG6B r9b + +#endif + +/* + * Macros to generate condition code outputs from inline assembly, + * The output operand must be type "bool". + */ +#ifdef __GCC_ASM_FLAG_OUTPUTS__ +# define CC_SET(c) "\n\t/* output condition code " #c "*/\n" +# define CC_OUT(c) "=@cc" #c +#else +# define CC_SET(c) "\n\tset" #c " %[_cc_" #c "]\n" +# define CC_OUT(c) [_cc_ ## c] "=qm" +#endif + +/* Exception table entry */ +#ifdef __ASSEMBLY__ +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + .pushsection "__ex_table","a" ; \ + .balign 4 ; \ + .long (from) - . ; \ + .long (to) - . ; \ + .long (handler) - . ; \ + .popsection + +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) + +# define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# ifdef CONFIG_KPROBES +# define _ASM_NOKPROBE(entry) \ + .pushsection "_kprobe_blacklist","aw" ; \ + _ASM_ALIGN ; \ + _ASM_PTR (entry); \ + .popsection +# else +# define _ASM_NOKPROBE(entry) +# endif + +#else /* ! __ASSEMBLY__ */ +# define _EXPAND_EXTABLE_HANDLE(x) #x +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ + " .pushsection \"__ex_table\",\"a\"\n" \ + " .balign 4\n" \ + " .long (" #from ") - .\n" \ + " .long (" #to ") - .\n" \ + " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ + " .popsection\n" + +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_UA(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess) + +# define _ASM_EXTABLE_CPY(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_copy) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +/* For C file, we already have NOKPROBE_SYMBOL macro */ + +/* + * This output constraint should be used for any inline asm which has a "call" + * instruction. Otherwise the asm may be inserted before the frame pointer + * gets set up by the containing function. If you forget to do this, objtool + * may print a "call without frame pointer save/setup" warning. + */ +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_ASM_H */ diff --git a/tools/arch/x86/include/asm/nops.h b/tools/arch/x86/include/asm/nops.h index c1e5e818ba16..c5573eaa5bb9 100644 --- a/tools/arch/x86/include/asm/nops.h +++ b/tools/arch/x86/include/asm/nops.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_NOPS_H #define _ASM_X86_NOPS_H +#include + /* * Define nops for use with alternative() and for tracing. */ @@ -57,20 +59,14 @@ #endif /* CONFIG_64BIT */ -#ifdef __ASSEMBLY__ -#define _ASM_MK_NOP(x) .byte x -#else -#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" -#endif - -#define ASM_NOP1 _ASM_MK_NOP(BYTES_NOP1) -#define ASM_NOP2 _ASM_MK_NOP(BYTES_NOP2) -#define ASM_NOP3 _ASM_MK_NOP(BYTES_NOP3) -#define ASM_NOP4 _ASM_MK_NOP(BYTES_NOP4) -#define ASM_NOP5 _ASM_MK_NOP(BYTES_NOP5) -#define ASM_NOP6 _ASM_MK_NOP(BYTES_NOP6) -#define ASM_NOP7 _ASM_MK_NOP(BYTES_NOP7) -#define ASM_NOP8 _ASM_MK_NOP(BYTES_NOP8) +#define ASM_NOP1 _ASM_BYTES(BYTES_NOP1) +#define ASM_NOP2 _ASM_BYTES(BYTES_NOP2) +#define ASM_NOP3 _ASM_BYTES(BYTES_NOP3) +#define ASM_NOP4 _ASM_BYTES(BYTES_NOP4) +#define ASM_NOP5 _ASM_BYTES(BYTES_NOP5) +#define ASM_NOP6 _ASM_BYTES(BYTES_NOP6) +#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) +#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) #define ASM_NOP_MAX 8 -- cgit v1.2.3 From 637be9183e0475c430fc77162c222bcaab887989 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 8 May 2021 00:07:46 +0200 Subject: asm-generic: use asm-generic/unaligned.h for most architectures There are several architectures that just duplicate the contents of asm-generic/unaligned.h, so change those over to use the file directly, to make future modifications easier. The exceptions are: - arm32 sets HAVE_EFFICIENT_UNALIGNED_ACCESS, but wants the unaligned-struct version - ppc64le disables HAVE_EFFICIENT_UNALIGNED_ACCESS but includes the access-ok version - most m68k also uses the access-ok version without setting HAVE_EFFICIENT_UNALIGNED_ACCESS. - sh4a has a custom inline asm version - openrisc is the only one using the memmove version that generally leads to worse code. Signed-off-by: Arnd Bergmann Reviewed-by: Thomas Gleixner Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven --- arch/alpha/include/asm/unaligned.h | 12 ------------ arch/ia64/include/asm/unaligned.h | 12 ------------ arch/m68k/include/asm/unaligned.h | 9 +-------- arch/microblaze/include/asm/unaligned.h | 27 --------------------------- arch/parisc/include/asm/unaligned.h | 6 +----- arch/sparc/include/asm/unaligned.h | 11 ----------- arch/x86/include/asm/unaligned.h | 15 --------------- arch/xtensa/include/asm/unaligned.h | 29 ----------------------------- 8 files changed, 2 insertions(+), 119 deletions(-) delete mode 100644 arch/alpha/include/asm/unaligned.h delete mode 100644 arch/ia64/include/asm/unaligned.h delete mode 100644 arch/microblaze/include/asm/unaligned.h delete mode 100644 arch/sparc/include/asm/unaligned.h delete mode 100644 arch/x86/include/asm/unaligned.h delete mode 100644 arch/xtensa/include/asm/unaligned.h (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/unaligned.h b/arch/alpha/include/asm/unaligned.h deleted file mode 100644 index 863c807b66f8..000000000000 --- a/arch/alpha/include/asm/unaligned.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_ALPHA_UNALIGNED_H -#define _ASM_ALPHA_UNALIGNED_H - -#include -#include -#include - -#define get_unaligned __get_unaligned_le -#define put_unaligned __put_unaligned_le - -#endif /* _ASM_ALPHA_UNALIGNED_H */ diff --git a/arch/ia64/include/asm/unaligned.h b/arch/ia64/include/asm/unaligned.h deleted file mode 100644 index 328942e3cbce..000000000000 --- a/arch/ia64/include/asm/unaligned.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_IA64_UNALIGNED_H -#define _ASM_IA64_UNALIGNED_H - -#include -#include -#include - -#define get_unaligned __get_unaligned_le -#define put_unaligned __put_unaligned_le - -#endif /* _ASM_IA64_UNALIGNED_H */ diff --git a/arch/m68k/include/asm/unaligned.h b/arch/m68k/include/asm/unaligned.h index 98c8930d3d35..84e437337344 100644 --- a/arch/m68k/include/asm/unaligned.h +++ b/arch/m68k/include/asm/unaligned.h @@ -2,15 +2,8 @@ #ifndef _ASM_M68K_UNALIGNED_H #define _ASM_M68K_UNALIGNED_H - #ifdef CONFIG_CPU_HAS_NO_UNALIGNED -#include -#include -#include - -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - +#include #else /* * The m68k can do unaligned accesses itself. diff --git a/arch/microblaze/include/asm/unaligned.h b/arch/microblaze/include/asm/unaligned.h deleted file mode 100644 index 448299beab69..000000000000 --- a/arch/microblaze/include/asm/unaligned.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2008 Michal Simek - * Copyright (C) 2006 Atmark Techno, Inc. - */ - -#ifndef _ASM_MICROBLAZE_UNALIGNED_H -#define _ASM_MICROBLAZE_UNALIGNED_H - -# ifdef __KERNEL__ - -# ifdef __MICROBLAZEEL__ -# include -# include -# define get_unaligned __get_unaligned_le -# define put_unaligned __put_unaligned_le -# else -# include -# include -# define get_unaligned __get_unaligned_be -# define put_unaligned __put_unaligned_be -# endif - -# include - -# endif /* __KERNEL__ */ -#endif /* _ASM_MICROBLAZE_UNALIGNED_H */ diff --git a/arch/parisc/include/asm/unaligned.h b/arch/parisc/include/asm/unaligned.h index e9029c7c2a69..3bda16773ba6 100644 --- a/arch/parisc/include/asm/unaligned.h +++ b/arch/parisc/include/asm/unaligned.h @@ -2,11 +2,7 @@ #ifndef _ASM_PARISC_UNALIGNED_H #define _ASM_PARISC_UNALIGNED_H -#include -#include -#include -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be +#include #ifdef __KERNEL__ struct pt_regs; diff --git a/arch/sparc/include/asm/unaligned.h b/arch/sparc/include/asm/unaligned.h deleted file mode 100644 index 7971d89d2f54..000000000000 --- a/arch/sparc/include/asm/unaligned.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_SPARC_UNALIGNED_H -#define _ASM_SPARC_UNALIGNED_H - -#include -#include -#include -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - -#endif /* _ASM_SPARC_UNALIGNED_H */ diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h deleted file mode 100644 index 9c754a7447aa..000000000000 --- a/arch/x86/include/asm/unaligned.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_UNALIGNED_H -#define _ASM_X86_UNALIGNED_H - -/* - * The x86 can do unaligned accesses itself. - */ - -#include -#include - -#define get_unaligned __get_unaligned_le -#define put_unaligned __put_unaligned_le - -#endif /* _ASM_X86_UNALIGNED_H */ diff --git a/arch/xtensa/include/asm/unaligned.h b/arch/xtensa/include/asm/unaligned.h deleted file mode 100644 index 8e7ed046bfed..000000000000 --- a/arch/xtensa/include/asm/unaligned.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Xtensa doesn't handle unaligned accesses efficiently. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ -#ifndef _ASM_XTENSA_UNALIGNED_H -#define _ASM_XTENSA_UNALIGNED_H - -#include - -#ifdef __LITTLE_ENDIAN -# include -# include -# include -# define get_unaligned __get_unaligned_le -# define put_unaligned __put_unaligned_le -#else -# include -# include -# include -# define get_unaligned __get_unaligned_be -# define put_unaligned __put_unaligned_be -#endif - -#endif /* _ASM_XTENSA_UNALIGNED_H */ -- cgit v1.2.3 From 6627eb25e40cc8d135d3f8d5391851d18ac497d7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 11:53:10 -0700 Subject: x86/entry: Unify definitions from and The register offsets in are duplicated in entry/calling.h, but are formatted differently and therefore not compatible. Use the version from consistently. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510185316.3307264-2-hpa@zytor.com --- arch/x86/entry/calling.h | 36 +----------------------------------- arch/x86/kernel/head_64.S | 6 +++--- 2 files changed, 4 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 07a9331d55e7..7436d4a74ecb 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,7 @@ #include #include #include +#include /* @@ -62,41 +63,6 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -/* The layout forms the "struct pt_regs" on the stack: */ -/* - * C ABI says these regs are callee-preserved. They aren't saved on kernel entry - * unless syscall needs a complete, fully filled "struct pt_regs". - */ -#define R15 0*8 -#define R14 1*8 -#define R13 2*8 -#define R12 3*8 -#define RBP 4*8 -#define RBX 5*8 -/* These regs are callee-clobbered. Always saved on kernel entry. */ -#define R11 6*8 -#define R10 7*8 -#define R9 8*8 -#define R8 9*8 -#define RAX 10*8 -#define RCX 11*8 -#define RDX 12*8 -#define RSI 13*8 -#define RDI 14*8 -/* - * On syscall entry, this is syscall#. On CPU exception, this is error code. - * On hw interrupt, it's IRQ number: - */ -#define ORIG_RAX 15*8 -/* Return frame for iretq */ -#define RIP 16*8 -#define CS 17*8 -#define EFLAGS 18*8 -#define RSP 19*8 -#define SS 20*8 - -#define SIZEOF_PTREGS 21*8 - .macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 .if \save_ret pushq %rsi /* pt_regs->si */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 04bddaaba8e2..d8b3ebd2bb85 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -62,7 +62,7 @@ SYM_CODE_START_NOALIGN(startup_64) */ /* Set up the stack for verify_cpu(), similar to initial_stack below */ - leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp + leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp leaq _text(%rip), %rdi pushq %rsi @@ -343,10 +343,10 @@ SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb) #endif /* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder + * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder * reliably detect the end of the stack. */ -SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS) +SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE) __FINITDATA __INIT -- cgit v1.2.3 From 3e5e7f7736b05d5fdf2cc4e0ba4f2d8bc42c630d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 11:53:11 -0700 Subject: x86/entry: Reverse arguments to do_syscall_64() Reverse the order of arguments to do_syscall_64() so that the first argument is the pt_regs pointer. This is not only consistent with *all* other entry points from assembly, but it actually makes the compiled code slightly better. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510185316.3307264-3-hpa@zytor.com --- arch/x86/entry/common.c | 2 +- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/syscall.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7b2542b13ebd..00da0f5420de 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -36,7 +36,7 @@ #include #ifdef CONFIG_X86_64 -__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) +__visible noinstr void do_syscall_64(struct pt_regs *regs, unsigned long nr) { add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a16a5294d55f..1d9db15fdc69 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -107,8 +107,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) PUSH_AND_CLEAR_REGS rax=$-ENOSYS /* IRQs are off. */ - movq %rax, %rdi - movq %rsp, %rsi + movq %rsp, %rdi + movq %rax, %rsi call do_syscall_64 /* returns with IRQs disabled */ /* diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 7cbf733d11af..4e20054d7533 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -160,7 +160,7 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(unsigned long nr, struct pt_regs *regs); +void do_syscall_64(struct pt_regs *regs, unsigned long nr); void do_int80_syscall_32(struct pt_regs *regs); long do_fast_syscall_32(struct pt_regs *regs); -- cgit v1.2.3 From dce0aa3b2ef28900cc4c779c59a870f1b4bdadee Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 11:53:12 -0700 Subject: x86/syscall: Unconditionally prototype {ia32,x32}_sys_call_table[] Even if these APIs are disabled, and the arrays therefore do not exist, having the prototypes allows us to use IS_ENABLED() rather than using #ifdefs. If something ends up trying to actually *use* these arrays a linker error will ensue. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510185316.3307264-4-hpa@zytor.com --- arch/x86/include/asm/syscall.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4e20054d7533..f6593cafdbd9 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -21,13 +21,12 @@ extern const sys_call_ptr_t sys_call_table[]; #if defined(CONFIG_X86_32) #define ia32_sys_call_table sys_call_table -#endif - -#if defined(CONFIG_IA32_EMULATION) +#else +/* + * These may not exist, but still put the prototypes in so we + * can use IS_ENABLED(). + */ extern const sys_call_ptr_t ia32_sys_call_table[]; -#endif - -#ifdef CONFIG_X86_X32_ABI extern const sys_call_ptr_t x32_sys_call_table[]; #endif -- cgit v1.2.3 From 6de4ac1d03f75248974a398110b15af0bfe65a11 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 11:53:13 -0700 Subject: x86/syscall: Maximize MSR_SYSCALL_MASK It is better to clear as many flags as possible when we do a system call entry, as opposed to the other way around. The fewer flags we keep, the lesser the possible interference between the kernel and user space. The flags changed are: - CF, PF, AF, ZF, SF, OF: these are arithmetic flags which affect branches, possibly speculatively. They should be cleared for the same reasons we now clear all GPRs on entry. - RF: suppresses a code breakpoint on the subsequent instruction. It is probably impossible to enter the kernel with RF set, but if it is somehow not, it would break a kernel debugger setting a breakpoint on the entry point. Either way, user space should not be able to control kernel behavior here. - ID: this flag has no direct effect (it is a scratch bit only.) However, there is no reason to retain the user space value in the kernel, and the standard should be to clear unless needed, not the other way around. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510185316.3307264-5-hpa@zytor.com --- arch/x86/kernel/cpu/common.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a1b756c49a93..6cf697574661 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1773,10 +1773,16 @@ void syscall_init(void) wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif - /* Flags to clear on syscall */ + /* + * Flags to clear on syscall; clear as much as possible + * to minimize user space-kernel interference. + */ wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| - X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); + X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| + X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| + X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| + X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF| + X86_EFLAGS_AC|X86_EFLAGS_ID); } #else /* CONFIG_X86_64 */ -- cgit v1.2.3 From 29e9758966f47004bd7245e6adadcb708386f36a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Mon, 10 May 2021 11:53:14 -0700 Subject: x86/entry: Split PUSH_AND_CLEAR_REGS into two submacros PUSH_AND_CLEAR_REGS, as the name implies, performs two functions: pushing registers and clearing registers. They don't necessarily have to be performed in immediate sequence, although all current users do. Split it into two macros for the case where that isn't desired; the FRED enabling patchset will eventually make use of this. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510185316.3307264-6-hpa@zytor.com --- arch/x86/entry/calling.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 7436d4a74ecb..a4c061fb7c6e 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -63,7 +63,7 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 +.macro PUSH_REGS rdx=%rdx rax=%rax save_ret=0 .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -90,7 +90,9 @@ For 32-bit we have the following conventions - kernel is built with .if \save_ret pushq %rsi /* return address on top of stack */ .endif +.endm +.macro CLEAR_REGS /* * Sanitize registers of values that a speculation attack might * otherwise want to exploit. The lower registers are likely clobbered @@ -112,6 +114,11 @@ For 32-bit we have the following conventions - kernel is built with .endm +.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 + PUSH_REGS rdx=\rdx, rax=\rax, save_ret=\save_ret + CLEAR_REGS +.endm + .macro POP_REGS pop_rdi=1 skip_r11rcx=0 popq %r15 popq %r14 -- cgit v1.2.3 From 9ddcb87b9218dec760e8d8a780bc8ad514c3d36a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 10 May 2021 11:53:15 -0700 Subject: x86/regs: Syscall_get_nr() returns -1 for a non-system call syscall_get_nr() is defined to return -1 for a non-system call or a ptrace/seccomp restart; not just any arbitrary number. See comment in for the official definition of this function. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210510185316.3307264-7-hpa@zytor.com --- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/signal.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 87a4143aa7d7..4c208ea3bd9f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -911,7 +911,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) * syscall with TS_COMPAT still set. */ regs->orig_ax = value; - if (syscall_get_nr(child, regs) >= 0) + if (syscall_get_nr(child, regs) != -1) child->thread_info.status |= TS_I386_REGS_POKED; break; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index a06cb107c0e8..e12779a2714d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -713,7 +713,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); /* Are we from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { + if (syscall_get_nr(current, regs) != -1) { /* If so, check system call restarting.. */ switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: @@ -793,7 +793,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) } /* Did we come from a system call? */ - if (syscall_get_nr(current, regs) >= 0) { + if (syscall_get_nr(current, regs) != -1) { /* Restart the system call - no handlers present */ switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: -- cgit v1.2.3 From 63b3f96e1a989846a5a521d4fbef4bc86406929d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 May 2021 22:43:39 +0200 Subject: kvm: Select SCHED_INFO instead of TASK_DELAY_ACCT AFAICT KVM only relies on SCHED_INFO. Nothing uses the p->delays data that belongs to TASK_DELAY_ACCT. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Paolo Bonzini Acked-by: Marc Zyngier Acked-by: Balbir Singh Link: https://lkml.kernel.org/r/20210505111525.187225172@infradead.org --- arch/arm64/kvm/Kconfig | 5 +---- arch/x86/kvm/Kconfig | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 3964acf5451e..a4eba0908bfa 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -20,8 +20,6 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF - # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET && MULTIUSER select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT @@ -38,8 +36,7 @@ menuconfig KVM select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_VCPU_RUN_PID_CHANGE - select TASKSTATS - select TASK_DELAY_ACCT + select SCHED_INFO help Support hosting virtualized guest machines. diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f6b93a35ce14..fb8efb387aff 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,8 +22,6 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM depends on HIGH_RES_TIMERS - # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET && MULTIUSER depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER @@ -36,8 +34,7 @@ config KVM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO - select TASKSTATS - select TASK_DELAY_ACCT + select SCHED_INFO select PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT -- cgit v1.2.3 From 64e1f5872a8c3d80bce4686b4ab5dbc6e6bd30c5 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 6 May 2021 22:07:26 +0300 Subject: x86/alternatives: Make the x86nops[] symbol static Sparse says: arch/x86/kernel/alternative.c:78:21: warning: symbol 'x86nops' was not declared. Should it be static? Since x86nops[] is not used outside this file, Sparse is right and it can be made static. Signed-off-by: Pavel Skripkin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506190726.15575-1-paskripkin@gmail.com --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6974b5174495..75c752b0628c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -75,7 +75,7 @@ do { \ } \ } while (0) -const unsigned char x86nops[] = +static const unsigned char x86nops[] = { BYTES_NOP1, BYTES_NOP2, -- cgit v1.2.3 From 1bc67873d401e6c2e6e30be7fef21337db07a042 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 May 2021 11:33:10 +0200 Subject: x86/asm: Simplify __smp_mb() definition Drop the bitness ifdeffery in favor of using _ASM_SP, which is the helper macro for the rSP register specification for 32 and 64 bit depending on the build. No functional changes. Signed-off-by: Borislav Petkov Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210512093310.5635-1-bp@alien8.de --- arch/x86/include/asm/barrier.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 4819d5e5a335..3ba772a69cc8 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -54,11 +54,8 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define dma_rmb() barrier() #define dma_wmb() barrier() -#ifdef CONFIG_X86_32 -#define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc") -#else -#define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc") -#endif +#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") + #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) -- cgit v1.2.3 From f1a0a376ca0c4ef1fc3d24e3e502acbb5b795674 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 12 May 2021 10:46:36 +0100 Subject: sched/core: Initialize the idle task with preemption disabled As pointed out by commit de9b8f5dcbd9 ("sched: Fix crash trying to dequeue/enqueue the idle thread") init_idle() can and will be invoked more than once on the same idle task. At boot time, it is invoked for the boot CPU thread by sched_init(). Then smp_init() creates the threads for all the secondary CPUs and invokes init_idle() on them. As the hotplug machinery brings the secondaries to life, it will issue calls to idle_thread_get(), which itself invokes init_idle() yet again. In this case it's invoked twice more per secondary: at _cpu_up(), and at bringup_cpu(). Given smp_init() already initializes the idle tasks for all *possible* CPUs, no further initialization should be required. Now, removing init_idle() from idle_thread_get() exposes some interesting expectations with regards to the idle task's preempt_count: the secondary startup always issues a preempt_disable(), requiring some reset of the preempt count to 0 between hot-unplug and hotplug, which is currently served by idle_thread_get() -> idle_init(). Given the idle task is supposed to have preemption disabled once and never see it re-enabled, it seems that what we actually want is to initialize its preempt_count to PREEMPT_DISABLED and leave it there. Do that, and remove init_idle() from idle_thread_get(). Secondary startups were patched via coccinelle: @begone@ @@ -preempt_disable(); ... cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20210512094636.2958515-1-valentin.schneider@arm.com --- arch/alpha/kernel/smp.c | 1 - arch/arc/kernel/smp.c | 1 - arch/arm/kernel/smp.c | 1 - arch/arm64/include/asm/preempt.h | 2 +- arch/arm64/kernel/smp.c | 1 - arch/csky/kernel/smp.c | 1 - arch/ia64/kernel/smpboot.c | 1 - arch/mips/kernel/smp.c | 1 - arch/openrisc/kernel/smp.c | 2 -- arch/parisc/kernel/smp.c | 1 - arch/powerpc/kernel/smp.c | 1 - arch/riscv/kernel/smpboot.c | 1 - arch/s390/include/asm/preempt.h | 4 ++-- arch/s390/kernel/smp.c | 1 - arch/sh/kernel/smp.c | 2 -- arch/sparc/kernel/smp_32.c | 1 - arch/sparc/kernel/smp_64.c | 3 --- arch/x86/include/asm/preempt.h | 2 +- arch/x86/kernel/smpboot.c | 1 - arch/xtensa/kernel/smp.c | 1 - include/asm-generic/preempt.h | 2 +- init/main.c | 6 +----- kernel/fork.c | 2 +- kernel/sched/core.c | 2 +- kernel/smpboot.c | 1 - 25 files changed, 8 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index f4dd9f3f3001..4b2575f936d4 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -166,7 +166,6 @@ smp_callin(void) DBGS(("smp_callin: commencing CPU %d current %p active_mm %p\n", cpuid, current, current->active_mm)); - preempt_disable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 52906d314537..db0e104d6835 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -189,7 +189,6 @@ void start_kernel_secondary(void) pr_info("## CPU%u LIVE ##: Executing Code...\n", cpu); local_irq_enable(); - preempt_disable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 74679240a9d8..c7bb168b0d97 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -432,7 +432,6 @@ asmlinkage void secondary_start_kernel(void) #endif pr_debug("CPU%u: Booted secondary processor\n", cpu); - preempt_disable(); trace_hardirqs_off(); /* diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h index 80e946b2abee..e83f0982b99c 100644 --- a/arch/arm64/include/asm/preempt.h +++ b/arch/arm64/include/asm/preempt.h @@ -23,7 +23,7 @@ static inline void preempt_count_set(u64 pc) } while (0) #define init_idle_preempt_count(p, cpu) do { \ - task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \ + task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ } while (0) static inline void set_preempt_need_resched(void) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dcd7041b2b07..6671000a8b7d 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -224,7 +224,6 @@ asmlinkage notrace void secondary_start_kernel(void) init_gic_priority_masking(); rcu_cpu_starting(cpu); - preempt_disable(); trace_hardirqs_off(); /* diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c index 0f9f5eef9338..e2993539af8e 100644 --- a/arch/csky/kernel/smp.c +++ b/arch/csky/kernel/smp.c @@ -281,7 +281,6 @@ void csky_start_secondary(void) pr_info("CPU%u Online: %s...\n", cpu, __func__); local_irq_enable(); - preempt_disable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 49b488580939..d10f780c13b9 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -441,7 +441,6 @@ start_secondary (void *unused) #endif efi_map_pal_code(); cpu_init(); - preempt_disable(); smp_callin(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index ef86fbad8546..d542fb7af3ba 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -348,7 +348,6 @@ asmlinkage void start_secondary(void) */ calibrate_delay(); - preempt_disable(); cpu = smp_processor_id(); cpu_data[cpu].udelay_val = loops_per_jiffy; diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 48e1092a64de..415e209732a3 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -145,8 +145,6 @@ asmlinkage __init void secondary_start_kernel(void) set_cpu_online(cpu, true); local_irq_enable(); - - preempt_disable(); /* * OK, it's off to the idle thread for us */ diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 10227f667c8a..1405b603b91b 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -302,7 +302,6 @@ void __init smp_callin(unsigned long pdce_proc) #endif smp_cpu_init(slave_id); - preempt_disable(); flush_cache_all_local(); /* start with known state */ flush_tlb_all_local(NULL); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 2e05c783440a..6c6e4d934d86 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1547,7 +1547,6 @@ void start_secondary(void *unused) smp_store_cpu_info(cpu); set_dec(tb_ticks_per_jiffy); rcu_cpu_starting(cpu); - preempt_disable(); cpu_callin_map[cpu] = 1; if (smp_ops->setup_cpu) diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 9a408e2942ac..bd82375db51a 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -180,7 +180,6 @@ asmlinkage __visible void smp_callin(void) * Disable preemption before enabling interrupts, so we don't try to * schedule a CPU that hasn't actually started yet. */ - preempt_disable(); local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index b49e0492842c..23ff51be7e29 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -32,7 +32,7 @@ static inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - S390_lowcore.preempt_count = PREEMPT_ENABLED; \ + S390_lowcore.preempt_count = PREEMPT_DISABLED; \ } while (0) static inline void set_preempt_need_resched(void) @@ -91,7 +91,7 @@ static inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - S390_lowcore.preempt_count = PREEMPT_ENABLED; \ + S390_lowcore.preempt_count = PREEMPT_DISABLED; \ } while (0) static inline void set_preempt_need_resched(void) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2fec2b80d35d..111909aeb8d2 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -878,7 +878,6 @@ static void smp_init_secondary(void) restore_access_regs(S390_lowcore.access_regs_save_area); cpu_init(); rcu_cpu_starting(cpu); - preempt_disable(); init_cpu_timer(); vtime_init(); vdso_getcpu_init(); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 372acdc9033e..65924d9ec245 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -186,8 +186,6 @@ asmlinkage void start_secondary(void) per_cpu_trap_init(); - preempt_disable(); - notify_cpu_starting(cpu); local_irq_enable(); diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 50c127ab46d5..22b148e5a5f8 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -348,7 +348,6 @@ static void sparc_start_secondary(void *arg) */ arch_cpu_pre_starting(arg); - preempt_disable(); cpu = smp_processor_id(); notify_cpu_starting(cpu); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index e38d8bf454e8..ae5faa1d989d 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -138,9 +138,6 @@ void smp_callin(void) set_cpu_online(cpuid, true); - /* idle thread is expected to have preempt disabled */ - preempt_disable(); - local_irq_enable(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index f8cb8af4de5c..fe5efbcba824 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc) #define init_task_preempt_count(p) do { } while (0) #define init_idle_preempt_count(p, cpu) do { \ - per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \ + per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \ } while (0) /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0ad5214f598a..0936f5ba3222 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -236,7 +236,6 @@ static void notrace start_secondary(void *unused) cpu_init(); rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); - preempt_disable(); smp_callin(); enable_start_cpu0 = 0; diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index cd85a7a2722b..1254da07ead1 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -145,7 +145,6 @@ void secondary_start_kernel(void) cpumask_set_cpu(cpu, mm_cpumask(mm)); enter_lazy_tlb(mm, current); - preempt_disable(); trace_hardirqs_off(); calibrate_delay(); diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index d683f5e6d791..b4d43a4af5f7 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -29,7 +29,7 @@ static __always_inline void preempt_count_set(int pc) } while (0) #define init_idle_preempt_count(p, cpu) do { \ - task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \ + task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ } while (0) static __always_inline void set_preempt_need_resched(void) diff --git a/init/main.c b/init/main.c index eb01e121d2f1..7b027d9c5c89 100644 --- a/init/main.c +++ b/init/main.c @@ -941,11 +941,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) * time - but meanwhile we still have a functioning scheduler. */ sched_init(); - /* - * Disable preemption - early bootup scheduling is extremely - * fragile until we cpu_idle() for the first time. - */ - preempt_disable(); + if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); diff --git a/kernel/fork.c b/kernel/fork.c index e7fd928fcafe..ace4631b5b54 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2412,7 +2412,7 @@ static inline void init_idle_pids(struct task_struct *idle) } } -struct task_struct *fork_idle(int cpu) +struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55b2d9399e12..9d00f4958bde 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8227,7 +8227,7 @@ void show_state_filter(unsigned long state_filter) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void init_idle(struct task_struct *idle, int cpu) +void __init init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index f25208e8df83..e4163042c4d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -33,7 +33,6 @@ struct task_struct *idle_thread_get(unsigned int cpu) if (!tsk) return ERR_PTR(-ENOMEM); - init_idle(tsk, cpu); return tsk; } -- cgit v1.2.3 From 80870e6ece78ce67b91398db88fb6b92a178f574 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:33:54 +0200 Subject: x86, objtool: Dont exclude arch/x86/realmode/ Specifically, init.c uses jump_labels. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.516200011@infradead.org --- arch/x86/realmode/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index 6b1f3a4eeb44..a0b491ae2de8 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -10,7 +10,6 @@ # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n -OBJECT_FILES_NON_STANDARD := y subdir- := rm -- cgit v1.2.3 From 8bfafcdccb52e770695b12530b1f800fe98b16b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:33:55 +0200 Subject: jump_label, x86: Strip ASM jump_label support In prepration for variable size jump_label support; remove all ASM bits, which are currently unused. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.599716762@infradead.org --- arch/x86/include/asm/jump_label.h | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 610a05374c02..01de21e2d967 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -47,42 +47,6 @@ l_yes: return true; } -#else /* __ASSEMBLY__ */ - -.macro STATIC_JUMP_IF_TRUE target, key, def -.Lstatic_jump_\@: - .if \def - /* Equivalent to "jmp.d32 \target" */ - .byte 0xe9 - .long \target - .Lstatic_jump_after_\@ -.Lstatic_jump_after_\@: - .else - .byte BYTES_NOP5 - .endif - .pushsection __jump_table, "aw" - _ASM_ALIGN - .long .Lstatic_jump_\@ - ., \target - . - _ASM_PTR \key - . - .popsection -.endm - -.macro STATIC_JUMP_IF_FALSE target, key, def -.Lstatic_jump_\@: - .if \def - .byte BYTES_NOP5 - .else - /* Equivalent to "jmp.d32 \target" */ - .byte 0xe9 - .long \target - .Lstatic_jump_after_\@ -.Lstatic_jump_after_\@: - .endif - .pushsection __jump_table, "aw" - _ASM_ALIGN - .long .Lstatic_jump_\@ - ., \target - . - _ASM_PTR \key + 1 - . - .popsection -.endm - #endif /* __ASSEMBLY__ */ #endif -- cgit v1.2.3 From e1aa35c4c4bc71e44dabc9d7d167b807edd7b439 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:33:56 +0200 Subject: jump_label, x86: Factor out the __jump_table generation Both arch_static_branch() and arch_static_branch_jump() have the same blurb to generate the __jump_table entry, share it. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.663132781@infradead.org --- arch/x86/include/asm/jump_label.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 01de21e2d967..dfdc2b1c17dd 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -14,15 +14,19 @@ #include #include +#define JUMP_TABLE_ENTRY \ + ".pushsection __jump_table, \"aw\" \n\t" \ + _ASM_ALIGN "\n\t" \ + ".long 1b - . \n\t" \ + ".long %l[l_yes] - . \n\t" \ + _ASM_PTR "%c0 + %c1 - .\n\t" \ + ".popsection \n\t" + static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" ".byte " __stringify(BYTES_NOP5) "\n\t" - ".pushsection __jump_table, \"aw\" \n\t" - _ASM_ALIGN "\n\t" - ".long 1b - ., %l[l_yes] - . \n\t" - _ASM_PTR "%c0 + %c1 - .\n\t" - ".popsection \n\t" + JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); return false; @@ -33,13 +37,9 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" - ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t" - "2:\n\t" - ".pushsection __jump_table, \"aw\" \n\t" - _ASM_ALIGN "\n\t" - ".long 1b - ., %l[l_yes] - . \n\t" - _ASM_PTR "%c0 + %c1 - .\n\t" - ".popsection \n\t" + ".byte 0xe9 \n\t" + ".long %l[l_yes] - (. + 4) \n\t" + JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); return false; -- cgit v1.2.3 From f9510fa9caaf8229381d5f86ba0774bf1a6ca39b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:33:57 +0200 Subject: jump_label, x86: Improve error when we fail expected text There is only a single usage site left, remove the function and extend the print to include more information, like the expected text and the patch type. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.726939027@infradead.org --- arch/x86/kernel/jump_label.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 6a2eb62c85e6..638d3b9be0ad 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -16,37 +16,32 @@ #include #include -static void bug_at(const void *ip, int line) -{ - /* - * The location is not an op that we were expecting. - * Something went wrong. Crash the box, as something could be - * corrupting the kernel. - */ - pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line); - BUG(); -} - static const void * __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type) { const void *expect, *code; const void *addr, *dest; - int line; addr = (void *)jump_entry_code(entry); dest = (void *)jump_entry_target(entry); code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); - if (type == JUMP_LABEL_JMP) { - expect = x86_nops[5]; line = __LINE__; - } else { - expect = code; line = __LINE__; - } + if (type == JUMP_LABEL_JMP) + expect = x86_nops[5]; + else + expect = code; - if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) - bug_at(addr, line); + if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) { + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. + */ + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) type:%d\n", + addr, addr, addr, expect, type); + BUG(); + } if (type == JUMP_LABEL_NOP) code = x86_nops[5]; -- cgit v1.2.3 From fa5e5dc39669b4427830c546ede8709323b8276c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:33:58 +0200 Subject: jump_label, x86: Introduce jump_entry_size() This allows architectures to have variable sized jumps. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.786777050@infradead.org --- arch/x86/include/asm/jump_label.h | 4 ++-- arch/x86/kernel/jump_label.c | 7 +++++++ include/linux/jump_label.h | 9 +++++++++ kernel/jump_label.c | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index dfdc2b1c17dd..d85802a00629 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -4,8 +4,6 @@ #define HAVE_JUMP_LABEL_BATCH -#define JUMP_LABEL_NOP_SIZE 5 - #include #include @@ -47,6 +45,8 @@ l_yes: return true; } +extern int arch_jump_entry_size(struct jump_entry *entry); + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 638d3b9be0ad..a29eecc14c94 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -16,6 +16,13 @@ #include #include +#define JUMP_LABEL_NOP_SIZE JMP32_INSN_SIZE + +int arch_jump_entry_size(struct jump_entry *entry) +{ + return JMP32_INSN_SIZE; +} + static const void * __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type) { diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 05f5554d860f..8c45f58292ac 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -176,6 +176,15 @@ static inline void jump_entry_set_init(struct jump_entry *entry) entry->key |= 2; } +static inline int jump_entry_size(struct jump_entry *entry) +{ +#ifdef JUMP_LABEL_NOP_SIZE + return JUMP_LABEL_NOP_SIZE; +#else + return arch_jump_entry_size(entry); +#endif +} + #endif #endif diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ba39fbb1f8e7..521cafcfcb69 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -309,7 +309,7 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && - jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; -- cgit v1.2.3 From 001951bea748d3f675e1778f42b17290a8c551bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:33:59 +0200 Subject: jump_label, x86: Add variable length patching support This allows the patching to to emit 2 byte JMP/NOP instruction in addition to the 5 byte JMP/NOP we already did. This allows for more compact code. This code is not yet used, as we don't emit shorter code at compile time yet. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.846870383@infradead.org --- arch/x86/kernel/jump_label.c | 53 +++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index a29eecc14c94..190d810fa435 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -23,44 +23,63 @@ int arch_jump_entry_size(struct jump_entry *entry) return JMP32_INSN_SIZE; } -static const void * -__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type) +struct jump_label_patch { + const void *code; + int size; +}; + +static struct jump_label_patch +__jump_label_patch(struct jump_entry *entry, enum jump_label_type type) { - const void *expect, *code; + const void *expect, *code, *nop; const void *addr, *dest; + int size; addr = (void *)jump_entry_code(entry); dest = (void *)jump_entry_target(entry); - code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); + size = arch_jump_entry_size(entry); + switch (size) { + case JMP8_INSN_SIZE: + code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest); + nop = x86_nops[size]; + break; + + case JMP32_INSN_SIZE: + code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest); + nop = x86_nops[size]; + break; + + default: BUG(); + } if (type == JUMP_LABEL_JMP) - expect = x86_nops[5]; + expect = nop; else expect = code; - if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE)) { + if (memcmp(addr, expect, size)) { /* * The location is not an op that we were expecting. * Something went wrong. Crash the box, as something could be * corrupting the kernel. */ - pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) type:%d\n", - addr, addr, addr, expect, type); + pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n", + addr, addr, addr, expect, size, type); BUG(); } if (type == JUMP_LABEL_NOP) - code = x86_nops[5]; + code = nop; - return code; + return (struct jump_label_patch){.code = code, .size = size}; } static inline void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, int init) { - const void *opcode = __jump_label_set_jump_code(entry, type); + const struct jump_label_patch jlp = __jump_label_patch(entry, type); /* * As long as only a single processor is running and the code is still @@ -74,12 +93,11 @@ static inline void __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), opcode, - JUMP_LABEL_NOP_SIZE); + text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size); return; } - text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL); + text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, @@ -100,7 +118,7 @@ void arch_jump_label_transform(struct jump_entry *entry, bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) { - const void *opcode; + struct jump_label_patch jlp; if (system_state == SYSTEM_BOOTING) { /* @@ -111,9 +129,8 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } mutex_lock(&text_mutex); - opcode = __jump_label_set_jump_code(entry, type); - text_poke_queue((void *)jump_entry_code(entry), - opcode, JUMP_LABEL_NOP_SIZE, NULL); + jlp = __jump_label_patch(entry, type); + text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } -- cgit v1.2.3 From e7bf1ba97afdde75b0ef43e4bdb718bf843613f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:34:01 +0200 Subject: jump_label, x86: Emit short JMP Now that we can patch short JMP/NOP, allow the compiler/assembler to emit short JMP instructions. There is no way to have the assembler emit short NOPs based on the potential displacement, so leave those long for now. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194157.967034497@infradead.org --- arch/x86/include/asm/jump_label.h | 3 +-- arch/x86/kernel/jump_label.c | 8 +++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index d85802a00629..ef819e33cfc1 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -35,8 +35,7 @@ l_yes: static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" - ".byte 0xe9 \n\t" - ".long %l[l_yes] - (. + 4) \n\t" + "jmp %l[l_yes]\n\t" JUMP_TABLE_ENTRY : : "i" (key), "i" (branch) : : l_yes); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 190d810fa435..a762dc1c615e 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -15,12 +15,18 @@ #include #include #include +#include #define JUMP_LABEL_NOP_SIZE JMP32_INSN_SIZE int arch_jump_entry_size(struct jump_entry *entry) { - return JMP32_INSN_SIZE; + struct insn insn = {}; + + insn_decode_kernel(&insn, (void *)jump_entry_code(entry)); + BUG_ON(insn.length != 2 && insn.length != 5); + + return insn.length; } struct jump_label_patch { -- cgit v1.2.3 From ab3257042c26d0cd44793c741e2f89bf38b21fe8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 May 2021 21:34:05 +0200 Subject: jump_label, x86: Allow short NOPs Now that objtool is able to rewrite jump_label instructions, have the compiler emit a JMP, such that it can decide on the optimal encoding, and set jump_entry::key bit1 to indicate that objtool should rewrite the instruction to a matching NOP. For x86_64-allyesconfig this gives: jl\ NOP JMP short: 22997 124 long: 30874 90 IOW, we save (22997+124) * 3 bytes of kernel text in hotpaths. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210506194158.216763632@infradead.org --- arch/x86/include/asm/jump_label.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index ef819e33cfc1..0449b125d27f 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -20,6 +20,22 @@ _ASM_PTR "%c0 + %c1 - .\n\t" \ ".popsection \n\t" +#ifdef CONFIG_STACK_VALIDATION + +static __always_inline bool arch_static_branch(struct static_key *key, bool branch) +{ + asm_volatile_goto("1:" + "jmp %l[l_yes] # objtool NOPs this \n\t" + JUMP_TABLE_ENTRY + : : "i" (key), "i" (2 | branch) : : l_yes); + + return false; +l_yes: + return true; +} + +#else + static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" @@ -32,6 +48,8 @@ l_yes: return true; } +#endif /* STACK_VALIDATION */ + static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { asm_volatile_goto("1:" -- cgit v1.2.3 From c43426334b3169b6c9e6855483aa7384ff09fd33 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 May 2021 19:58:31 +0200 Subject: x86: Fix leftover comment typos Signed-off-by: Ingo Molnar --- arch/x86/hyperv/hv_init.c | 2 +- arch/x86/include/asm/sgx.h | 2 +- arch/x86/include/asm/stackprotector.h | 2 +- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index bb0ae4b5c00f..256ad0e34dd2 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -623,7 +623,7 @@ bool hv_query_ext_cap(u64 cap_query) * output parameter to the hypercall below and so it should be * compatible with 'virt_to_phys'. Which means, it's address should be * directly mapped. Use 'static' to keep it compatible; stack variables - * can be virtually mapped, making them imcompatible with + * can be virtually mapped, making them incompatible with * 'virt_to_phys'. * Hypercall input/output addresses should also be 8-byte aligned. */ diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index 9c31e0ebc55b..05f3e21f01a7 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -13,7 +13,7 @@ /* * This file contains both data structures defined by SGX architecture and Linux * defined software data structures and functions. The two should not be mixed - * together for better readibility. The architectural definitions come first. + * together for better readability. The architectural definitions come first. */ /* The SGX specific CPUID function. */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index b6ffe58c70fa..24a8d6c4fb18 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -11,7 +11,7 @@ * The same segment is shared by percpu area and stack canary. On * x86_64, percpu symbols are zero based and %gs (64-bit) points to the * base of percpu area. The first occupant of the percpu area is always - * fixed_percpu_data which contains stack_canary at the approproate + * fixed_percpu_data which contains stack_canary at the appropriate * offset. On x86_32, the stack canary is just a regular percpu * variable. * diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d3d65545cb8b..7c4d0736a998 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -674,7 +674,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn) break; if (insn->addr_bytes != sizeof(unsigned long)) - return -EOPNOTSUPP; /* Don't support differnt size */ + return -EOPNOTSUPP; /* Don't support different size */ if (X86_MODRM_MOD(opcode) != 3) return -EOPNOTSUPP; /* TODO: support memory addressing */ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0144c40d09c7..5e60b00e8e50 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2374,7 +2374,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) * page is available, while the caller may end up allocating as many as * four pages, e.g. for PAE roots or for 5-level paging. Temporarily * exceeding the (arbitrary by default) limit will not harm the host, - * being too agressive may unnecessarily kill the guest, and getting an + * being too aggressive may unnecessarily kill the guest, and getting an * exact count is far more trouble than it's worth, especially in the * page fault paths. */ diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 95eeb5ac6a8a..e09cb1f97800 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1017,7 +1017,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (!is_shadow_present_pte(iter.old_spte)) { /* - * If SPTE has been forzen by another thread, just + * If SPTE has been frozen by another thread, just * give up and retry, avoiding unnecessary page table * allocation and free. */ -- cgit v1.2.3 From a554e740b66a83c7560b30e6b50bece37555ced3 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 22 Apr 2021 12:04:42 -0700 Subject: x86/boot/compressed: Enable -Wundef A discussion around -Wundef showed that there were still a few boolean Kconfigs where #if was used rather than #ifdef to guard different code. Kconfig doesn't define boolean configs, which can result in -Wundef warnings. arch/x86/boot/compressed/Makefile resets the CFLAGS used for this directory, and doesn't re-enable -Wundef as the top level Makefile does. If re-added, with RANDOMIZE_BASE and X86_NEED_RELOCS disabled, the following warnings are visible. arch/x86/boot/compressed/misc.h:82:5: warning: 'CONFIG_RANDOMIZE_BASE' is not defined, evaluates to 0 [-Wundef] ^ arch/x86/boot/compressed/misc.c:175:5: warning: 'CONFIG_X86_NEED_RELOCS' is not defined, evaluates to 0 [-Wundef] ^ Simply fix these and re-enable this warning for this directory. Suggested-by: Nathan Chancellor Suggested-by: Joe Perches Signed-off-by: Nick Desaulniers Signed-off-by: Ingo Molnar Acked-by: Kees Cook Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20210422190450.3903999-1-ndesaulniers@google.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/misc.c | 2 +- arch/x86/boot/compressed/misc.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 2a2975236c9e..431bf7f846c3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -30,6 +30,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ KBUILD_CFLAGS := -m$(BITS) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIE +KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index dde042f64cca..743f13ea25c1 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -172,7 +172,7 @@ void __puthex(unsigned long value) } } -#if CONFIG_X86_NEED_RELOCS +#ifdef CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) { diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index e5612f035498..31139256859f 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -79,7 +79,7 @@ struct mem_vector { u64 size; }; -#if CONFIG_RANDOMIZE_BASE +#ifdef CONFIG_RANDOMIZE_BASE /* kaslr.c */ void choose_random_location(unsigned long input, unsigned long input_size, -- cgit v1.2.3 From 3743d55b289c203d8f77b7cd47c24926b9d186ae Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Sun, 25 Apr 2021 15:34:51 +0800 Subject: x86, sched: Fix the AMD CPPC maximum performance value on certain AMD Ryzen generations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some AMD Ryzen generations has different calculation method on maximum performance. 255 is not for all ASICs, some specific generations should use 166 as the maximum performance. Otherwise, it will report incorrect frequency value like below: ~ → lscpu | grep MHz CPU MHz: 3400.000 CPU max MHz: 7228.3198 CPU min MHz: 2200.0000 [ mingo: Tidied up whitespace use. ] [ Alexander Monakov : fix 225 -> 255 typo. ] Fixes: 41ea667227ba ("x86, sched: Calculate frequency invariance for AMD systems") Fixes: 3c55e94c0ade ("cpufreq: ACPI: Extend frequency tables to cover boost frequencies") Reported-by: Jason Bagavatsingham Fixed-by: Alexander Monakov Reviewed-by: Rafael J. Wysocki Signed-off-by: Huang Rui Signed-off-by: Ingo Molnar Tested-by: Jason Bagavatsingham Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210425073451.2557394-1-ray.huang@amd.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=211791 Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/cpu/amd.c | 16 ++++++++++++++++ arch/x86/kernel/smpboot.c | 2 +- drivers/cpufreq/acpi-cpufreq.c | 6 +++++- 4 files changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 154321d29050..556b2b17c3e2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -787,8 +787,10 @@ DECLARE_PER_CPU(u64, msr_misc_features_shadow); #ifdef CONFIG_CPU_SUP_AMD extern u32 amd_get_nodes_per_socket(void); +extern u32 amd_get_highest_perf(void); #else static inline u32 amd_get_nodes_per_socket(void) { return 0; } +static inline u32 amd_get_highest_perf(void) { return 0; } #endif static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d11384dc9ab..6d7b3b3ea80b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1165,3 +1165,19 @@ void set_dr_addr_mask(unsigned long mask, int dr) break; } } + +u32 amd_get_highest_perf(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || + (c->x86_model >= 0x70 && c->x86_model < 0x80))) + return 166; + + if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || + (c->x86_model >= 0x40 && c->x86_model < 0x70))) + return 166; + + return 255; +} +EXPORT_SYMBOL_GPL(amd_get_highest_perf); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0ad5214f598a..7770245cc7fa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -2043,7 +2043,7 @@ static bool amd_set_max_freq_ratio(void) return false; } - highest_perf = perf_caps.highest_perf; + highest_perf = amd_get_highest_perf(); nominal_perf = perf_caps.nominal_perf; if (!highest_perf || !nominal_perf) { diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index d1bbc16fba4b..7e7450453714 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -646,7 +646,11 @@ static u64 get_max_boost_ratio(unsigned int cpu) return 0; } - highest_perf = perf_caps.highest_perf; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + highest_perf = amd_get_highest_perf(); + else + highest_perf = perf_caps.highest_perf; + nominal_perf = perf_caps.nominal_perf; if (!highest_perf || !nominal_perf) { -- cgit v1.2.3 From 41f45fb045bcc20e71eb705b361356e715682162 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 May 2021 13:41:41 +0200 Subject: x86/asm: Make valid on cross-builds as well Stephen Rothwell reported that the objtool cross-build breaks on non-x86 hosts: > tools/arch/x86/include/asm/asm.h:185:24: error: invalid register name for 'current_stack_pointer' > 185 | register unsigned long current_stack_pointer asm(_ASM_SP); > | ^~~~~~~~~~~~~~~~~~~~~ The PowerPC host obviously doesn't know much about x86 register names. Protect the kernel-specific bits of , so that it can be included by tooling and cross-built. Reported-by: Stephen Rothwell Reviewed-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/include/asm/asm.h | 4 ++++ tools/arch/x86/include/asm/asm.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 507a37a46027..3ad3da9a7d97 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -120,6 +120,8 @@ # define CC_OUT(c) [_cc_ ## c] "=qm" #endif +#ifdef __KERNEL__ + /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ @@ -186,4 +188,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_ASM_H */ diff --git a/tools/arch/x86/include/asm/asm.h b/tools/arch/x86/include/asm/asm.h index 507a37a46027..3ad3da9a7d97 100644 --- a/tools/arch/x86/include/asm/asm.h +++ b/tools/arch/x86/include/asm/asm.h @@ -120,6 +120,8 @@ # define CC_OUT(c) [_cc_ ## c] "=qm" #endif +#ifdef __KERNEL__ + /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ @@ -186,4 +188,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_ASM_H */ -- cgit v1.2.3 From d46f61b20b060f03b58fde170ee618f17dc6f99d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 May 2021 16:16:47 +0200 Subject: jump_label/x86: Remove unused JUMP_LABEL_NOP_SIZE JUMP_LABEL_NOP_SIZE is now unused, remove it. Fixes: 001951bea748 ("jump_label, x86: Add variable length patching support") Reported-by: Miroslav Benes Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/YJ00zxsvocDV5vLU@hirez.programming.kicks-ass.net --- arch/x86/kernel/jump_label.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index a762dc1c615e..674906fad43b 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -17,8 +17,6 @@ #include #include -#define JUMP_LABEL_NOP_SIZE JMP32_INSN_SIZE - int arch_jump_entry_size(struct jump_entry *entry) { struct insn insn = {}; -- cgit v1.2.3 From 28188cc461f6cf8b7d28de4f6df52014cc1d5e39 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 13 May 2021 09:39:04 -0700 Subject: x86/cpu: Fix core name for Sapphire Rapids Sapphire Rapids uses Golden Cove, not Willow Cove. Fixes: 53375a5a218e ("x86/cpu: Resort and comment Intel models") Signed-off-by: Andi Kleen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210513163904.3083274-1-ak@linux.intel.com --- arch/x86/include/asm/intel-family.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 955b06d6325a..27158436f322 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -102,7 +102,8 @@ #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ #define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */ -#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Willow Cove */ + +#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */ #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ -- cgit v1.2.3 From 3486d2c9be652a31033363bdd50391b0c8a8fe21 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 13 May 2021 09:32:46 +0200 Subject: clocksource/drivers/hyper-v: Re-enable VDSO_CLOCKMODE_HVCLOCK on X86 Mohammed reports (https://bugzilla.kernel.org/show_bug.cgi?id=213029) the commit e4ab4658f1cf ("clocksource/drivers/hyper-v: Handle vDSO differences inline") broke vDSO on x86. The problem appears to be that VDSO_CLOCKMODE_HVCLOCK is an enum value in 'enum vdso_clock_mode' and '#ifdef VDSO_CLOCKMODE_HVCLOCK' branch evaluates to false (it is not a define). Use a dedicated HAVE_VDSO_CLOCKMODE_HVCLOCK define instead. Fixes: e4ab4658f1cf ("clocksource/drivers/hyper-v: Handle vDSO differences inline") Reported-by: Mohammed Gamal Suggested-by: Thomas Gleixner Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210513073246.1715070-1-vkuznets@redhat.com --- arch/x86/include/asm/vdso/clocksource.h | 2 ++ drivers/clocksource/hyperv_timer.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vdso/clocksource.h b/arch/x86/include/asm/vdso/clocksource.h index 119ac8612d89..136e5e57cfe1 100644 --- a/arch/x86/include/asm/vdso/clocksource.h +++ b/arch/x86/include/asm/vdso/clocksource.h @@ -7,4 +7,6 @@ VDSO_CLOCKMODE_PVCLOCK, \ VDSO_CLOCKMODE_HVCLOCK +#define HAVE_VDSO_CLOCKMODE_HVCLOCK + #endif /* __ASM_VDSO_CLOCKSOURCE_H */ diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 977fd05ac35f..d6ece7bbce89 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -419,7 +419,7 @@ static void resume_hv_clock_tsc(struct clocksource *arg) hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr); } -#ifdef VDSO_CLOCKMODE_HVCLOCK +#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK static int hv_cs_enable(struct clocksource *cs) { vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); @@ -435,7 +435,7 @@ static struct clocksource hyperv_cs_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, .suspend= suspend_hv_clock_tsc, .resume = resume_hv_clock_tsc, -#ifdef VDSO_CLOCKMODE_HVCLOCK +#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK .enable = hv_cs_enable, .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, #else -- cgit v1.2.3 From 5b9fedb31e476693c90d8ee040e7d4c51b3e7cc4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 May 2021 14:39:56 +0200 Subject: quota: Disable quotactl_path syscall In commit fa8b90070a80 ("quota: wire up quotactl_path") we have wired up new quotactl_path syscall. However some people in LWN discussion have objected that the path based syscall is missing dirfd and flags argument which is mostly standard for contemporary path based syscalls. Indeed they have a point and after a discussion with Christian Brauner and Sascha Hauer I've decided to disable the syscall for now and update its API. Since there is no userspace currently using that syscall and it hasn't been released in any major release, we should be fine. CC: Christian Brauner CC: Sascha Hauer Link: https://lore.kernel.org/lkml/20210512153621.n5u43jsytbik4yze@wittgenstein Signed-off-by: Jan Kara --- arch/alpha/kernel/syscalls/syscall.tbl | 2 +- arch/arm/tools/syscall.tbl | 2 +- arch/arm64/include/asm/unistd32.h | 3 +-- arch/ia64/kernel/syscalls/syscall.tbl | 2 +- arch/m68k/kernel/syscalls/syscall.tbl | 2 +- arch/microblaze/kernel/syscalls/syscall.tbl | 2 +- arch/mips/kernel/syscalls/syscall_n32.tbl | 2 +- arch/mips/kernel/syscalls/syscall_n64.tbl | 2 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- arch/parisc/kernel/syscalls/syscall.tbl | 2 +- arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- arch/s390/kernel/syscalls/syscall.tbl | 2 +- arch/sh/kernel/syscalls/syscall.tbl | 2 +- arch/sparc/kernel/syscalls/syscall.tbl | 2 +- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- arch/xtensa/kernel/syscalls/syscall.tbl | 2 +- 17 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 5622578742fd..3000a2e8ee21 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -482,7 +482,7 @@ 550 common process_madvise sys_process_madvise 551 common epoll_pwait2 sys_epoll_pwait2 552 common mount_setattr sys_mount_setattr -553 common quotactl_path sys_quotactl_path +# 553 reserved for quotactl_path 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index c7679d7db98b..28e03b5fec00 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -456,7 +456,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 7859749d6628..5dab69d2c22b 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -893,8 +893,7 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise) __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_mount_setattr 442 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) -#define __NR_quotactl_path 443 -__SYSCALL(__NR_quotactl_path, sys_quotactl_path) +/* 443 is reserved for quotactl_path */ #define __NR_landlock_create_ruleset 444 __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) #define __NR_landlock_add_rule 445 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 1ee8e736a48e..bb11fe4c875a 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -363,7 +363,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 0dd019dc2136..79c2d24c89dd 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -442,7 +442,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 2ac716984ca2..b11395a20c20 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -448,7 +448,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 5e0096657251..9220909526f9 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -381,7 +381,7 @@ 440 n32 process_madvise sys_process_madvise 441 n32 epoll_pwait2 compat_sys_epoll_pwait2 442 n32 mount_setattr sys_mount_setattr -443 n32 quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 9974f5f8e49b..9cd1c34f31b5 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -357,7 +357,7 @@ 440 n64 process_madvise sys_process_madvise 441 n64 epoll_pwait2 sys_epoll_pwait2 442 n64 mount_setattr sys_mount_setattr -443 n64 quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 39d6e71e57b6..d560c467a8c6 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -430,7 +430,7 @@ 440 o32 process_madvise sys_process_madvise 441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 o32 mount_setattr sys_mount_setattr -443 o32 quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 5ac80b83d745..aabc37f8cae3 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -440,7 +440,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 2e68fbb57cc6..8f052ff4058c 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -522,7 +522,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 7e4a2aba366d..0690263df1dd 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -445,7 +445,7 @@ 440 common process_madvise sys_process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index f47a0dc55445..0b91499ebdcf 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -445,7 +445,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index b9e1c0e735b7..e34cc30ef22c 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -488,7 +488,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 28a1423ce32e..4bbc267fb36b 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -447,7 +447,7 @@ 440 i386 process_madvise sys_process_madvise 441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 i386 mount_setattr sys_mount_setattr -443 i386 quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 i386 landlock_create_ruleset sys_landlock_create_ruleset 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ecd551b08d05..ce18119ea0d0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,7 +364,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 9d76d433d3d6..fd2f30227d96 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -413,7 +413,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -443 common quotactl_path sys_quotactl_path +# 443 reserved for quotactl_path 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self -- cgit v1.2.3 From 14fad24d0520c65ecfc2eebe8e4cf25ca02f19cf Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 13 May 2021 15:33:41 +0200 Subject: x86/acpi: Switch to pr_xxx log functions Switching to pr_debug et al has two benefits: - We don't have to add PREFIX to each log statement - Debug output is suppressed except DEBUG is defined or dynamic debugging is enabled for the respective code piece. In addition ensure that longer messages aren't split to multiple lines in source code, checkpatch complains otherwise. Signed-off-by: Heiner Kallweit Acked-by: Pavel Machek Reviewed-by: Ingo Molnar Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 118 ++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 71 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e90310cbe73a..e55e0c1fad8c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -5,6 +5,7 @@ * Copyright (C) 2001, 2002 Paul Diefenbaugh * Copyright (C) 2001 Jun Nakajima */ +#define pr_fmt(fmt) "ACPI: " fmt #include #include @@ -42,8 +43,6 @@ EXPORT_SYMBOL(acpi_disabled); # include #endif /* X86 */ -#define PREFIX "ACPI: " - int acpi_noirq; /* skip ACPI IRQ initialization */ static int acpi_nobgrt; /* skip ACPI BGRT */ int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ @@ -130,15 +129,14 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) madt = (struct acpi_table_madt *)table; if (!madt) { - printk(KERN_WARNING PREFIX "Unable to map MADT\n"); + pr_warn("Unable to map MADT\n"); return -ENODEV; } if (madt->address) { acpi_lapic_addr = (u64) madt->address; - printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", - madt->address); + pr_debug("Local APIC address 0x%08x\n", madt->address); } default_acpi_madt_oem_check(madt->header.oem_id, @@ -161,7 +159,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) int cpu; if (id >= MAX_LOCAL_APIC) { - printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + pr_info("skipped apicid that is too big\n"); return -EINVAL; } @@ -213,13 +211,13 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) */ if (!apic->apic_id_valid(apic_id)) { if (enabled) - pr_warn(PREFIX "x2apic entry ignored\n"); + pr_warn("x2apic entry ignored\n"); return 0; } acpi_register_lapic(apic_id, processor->uid, enabled); #else - printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); + pr_warn("x2apic entry ignored\n"); #endif return 0; @@ -306,7 +304,7 @@ acpi_parse_x2apic_nmi(union acpi_subtable_headers *header, acpi_table_print_madt_entry(&header->common); if (x2apic_nmi->lint != 1) - printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + pr_warn("NMI not connected to LINT 1!\n"); return 0; } @@ -324,7 +322,7 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e acpi_table_print_madt_entry(&header->common); if (lapic_nmi->lint != 1) - printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + pr_warn("NMI not connected to LINT 1!\n"); return 0; } @@ -514,14 +512,14 @@ acpi_parse_int_src_ovr(union acpi_subtable_headers * header, if (intsrc->source_irq == 0) { if (acpi_skip_timer_override) { - printk(PREFIX "BIOS IRQ0 override ignored.\n"); + pr_warn("BIOS IRQ0 override ignored.\n"); return 0; } if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; - printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); + pr_warn("BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); } } @@ -597,7 +595,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) if (old == new) return; - printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); + pr_warn("setting ELCR to %04x (from %04x)\n", new, old); outb(new, 0x4d0); outb(new >> 8, 0x4d1); } @@ -754,7 +752,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { - pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); + pr_info("Unable to map lapic to logical cpu number\n"); return cpu; } @@ -870,8 +868,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table; if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) { - printk(KERN_WARNING PREFIX "HPET timers must be located in " - "memory.\n"); + pr_warn("HPET timers must be located in memory.\n"); return -1; } @@ -883,9 +880,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) * want to allocate a resource there. */ if (!hpet_address) { - printk(KERN_WARNING PREFIX - "HPET id: %#x base: %#lx is invalid\n", - hpet_tbl->id, hpet_address); + pr_warn("HPET id: %#x base: %#lx is invalid\n", hpet_tbl->id, hpet_address); return 0; } #ifdef CONFIG_X86_64 @@ -896,21 +891,17 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) */ if (hpet_address == 0xfed0000000000000UL) { if (!hpet_force_user) { - printk(KERN_WARNING PREFIX "HPET id: %#x " - "base: 0xfed0000000000000 is bogus\n " - "try hpet=force on the kernel command line to " - "fix it up to 0xfed00000.\n", hpet_tbl->id); + pr_warn("HPET id: %#x base: 0xfed0000000000000 is bogus, try hpet=force on the kernel command line to fix it up to 0xfed00000.\n", + hpet_tbl->id); hpet_address = 0; return 0; } - printk(KERN_WARNING PREFIX - "HPET id: %#x base: 0xfed0000000000000 fixed up " - "to 0xfed00000.\n", hpet_tbl->id); + pr_warn("HPET id: %#x base: 0xfed0000000000000 fixed up to 0xfed00000.\n", + hpet_tbl->id); hpet_address >>= 32; } #endif - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); + pr_info("HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); /* * Allocate and initialize the HPET firmware resource for adding into @@ -955,24 +946,24 @@ late_initcall(hpet_insert_resource); static int __init acpi_parse_fadt(struct acpi_table_header *table) { if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) { - pr_debug("ACPI: no legacy devices present\n"); + pr_debug("no legacy devices present\n"); x86_platform.legacy.devices.pnpbios = 0; } if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) && x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) { - pr_debug("ACPI: i8042 controller is absent\n"); + pr_debug("i8042 controller is absent\n"); x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT; } if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { - pr_debug("ACPI: not registering RTC platform device\n"); + pr_debug("not registering RTC platform device\n"); x86_platform.legacy.rtc = 0; } if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) { - pr_debug("ACPI: probing for VGA not safe\n"); + pr_debug("probing for VGA not safe\n"); x86_platform.legacy.no_vga = 1; } @@ -997,8 +988,7 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; } if (pmtmr_ioport) - printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", - pmtmr_ioport); + pr_info("PM-Timer IO Port: %#x\n", pmtmr_ioport); #endif return 0; } @@ -1024,8 +1014,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0); if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC address override entry\n"); + pr_err("Error parsing LAPIC address override entry\n"); return count; } @@ -1057,8 +1046,7 @@ static int __init acpi_parse_madt_lapic_entries(void) sizeof(struct acpi_table_madt), madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); if (ret < 0) { - printk(KERN_ERR PREFIX - "Error parsing LAPIC/X2APIC entries\n"); + pr_err("Error parsing LAPIC/X2APIC entries\n"); return ret; } @@ -1066,11 +1054,11 @@ static int __init acpi_parse_madt_lapic_entries(void) x2count = madt_proc[1].count; } if (!count && !x2count) { - printk(KERN_ERR PREFIX "No LAPIC entries present\n"); + pr_err("No LAPIC entries present\n"); /* TBD: Cleanup to allow fallback to MPS */ return -ENODEV; } else if (count < 0 || x2count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); + pr_err("Error parsing LAPIC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1080,7 +1068,7 @@ static int __init acpi_parse_madt_lapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); if (count < 0 || x2count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + pr_err("Error parsing LAPIC NMI entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1139,7 +1127,7 @@ static void __init mp_config_acpi_legacy_irqs(void) } if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + pr_debug("ACPI: IRQ%d used by override.\n", i); continue; /* IRQ already used */ } @@ -1179,26 +1167,24 @@ static int __init acpi_parse_madt_ioapic_entries(void) * if "noapic" boot option, don't look for IO-APICs */ if (skip_ioapic_setup) { - printk(KERN_INFO PREFIX "Skipping IOAPIC probe " - "due to 'noapic' option.\n"); + pr_info("Skipping IOAPIC probe due to 'noapic' option.\n"); return -ENODEV; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, MAX_IO_APICS); if (!count) { - printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); + pr_err("No IOAPIC entries present\n"); return -ENODEV; } else if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); + pr_err("Error parsing IOAPIC entry\n"); return count; } count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, nr_irqs); if (count < 0) { - printk(KERN_ERR PREFIX - "Error parsing interrupt source overrides entry\n"); + pr_err("Error parsing interrupt source overrides entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1218,7 +1204,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, nr_irqs); if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + pr_err("Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ return count; } @@ -1251,8 +1237,7 @@ static void __init early_acpi_process_madt(void) /* * Dell Precision Workstation 410, 610 come here. */ - printk(KERN_ERR PREFIX - "Invalid BIOS MADT, disabling ACPI\n"); + pr_err("Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } } @@ -1289,8 +1274,7 @@ static void __init acpi_process_madt(void) /* * Dell Precision Workstation 410, 610 come here. */ - printk(KERN_ERR PREFIX - "Invalid BIOS MADT, disabling ACPI\n"); + pr_err("Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } } else { @@ -1300,8 +1284,7 @@ static void __init acpi_process_madt(void) * Boot with "acpi=off" to use MPS on such a system. */ if (smp_found_config) { - printk(KERN_WARNING PREFIX - "No APIC-table, disabling MPS\n"); + pr_warn("No APIC-table, disabling MPS\n"); smp_found_config = 0; } } @@ -1311,11 +1294,9 @@ static void __init acpi_process_madt(void) * processors, where MPS only supports physical. */ if (acpi_lapic && acpi_ioapic) - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " - "information\n"); + pr_info("Using ACPI (MADT) for SMP configuration information\n"); else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) " - "configuration information\n"); + pr_info("Using ACPI for processor (LAPIC) configuration information\n"); #endif return; } @@ -1323,8 +1304,7 @@ static void __init acpi_process_madt(void) static int __init disable_acpi_irq(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", - d->ident); + pr_notice("%s detected: force use of acpi=noirq\n", d->ident); acpi_noirq_set(); } return 0; @@ -1333,8 +1313,7 @@ static int __init disable_acpi_irq(const struct dmi_system_id *d) static int __init disable_acpi_pci(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", - d->ident); + pr_notice("%s detected: force use of pci=noacpi\n", d->ident); acpi_disable_pci(); } return 0; @@ -1343,11 +1322,10 @@ static int __init disable_acpi_pci(const struct dmi_system_id *d) static int __init dmi_disable_acpi(const struct dmi_system_id *d) { if (!acpi_force) { - printk(KERN_NOTICE "%s detected: acpi off\n", d->ident); + pr_notice("%s detected: acpi off\n", d->ident); disable_acpi(); } else { - printk(KERN_NOTICE - "Warning: DMI blacklist says broken, but acpi forced\n"); + pr_notice("Warning: DMI blacklist says broken, but acpi forced\n"); } return 0; } @@ -1574,9 +1552,9 @@ int __init early_acpi_boot_init(void) */ if (acpi_blacklisted()) { if (acpi_force) { - printk(KERN_WARNING PREFIX "acpi=force override\n"); + pr_warn("acpi=force override\n"); } else { - printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); + pr_warn("Disabling ACPI support\n"); disable_acpi(); return 1; } @@ -1692,9 +1670,7 @@ int __init acpi_mps_check(void) #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) /* mptable code is not built-in*/ if (acpi_disabled || acpi_noirq) { - printk(KERN_WARNING "MPS support code is not built-in.\n" - "Using acpi=off or acpi=noirq or pci=noacpi " - "may have problem\n"); + pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); return 1; } #endif -- cgit v1.2.3 From fea63d54f7a3e74f8ab489a8b82413a29849a594 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 May 2021 12:42:32 -0500 Subject: x86/sev-es: Move sev_es_put_ghcb() in prep for follow on patch Move the location of sev_es_put_ghcb() in preparation for an update to it in a follow-on patch. This will better highlight the changes being made to the function. No functional change. Fixes: 0786138c78e79 ("x86/sev-es: Add a Runtime #VC Exception Handler") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/8c07662ec17d3d82e5c53841a1d9e766d3bdbab6.1621273353.git.thomas.lendacky@amd.com --- arch/x86/kernel/sev.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 9578c82832aa..45e212675811 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -221,24 +221,6 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) return ghcb; } -static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - data->ghcb_active = false; - } -} - /* Needed in vc_early_forward_exception */ void do_early_exception(struct pt_regs *regs, int trapnr); @@ -461,6 +443,24 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" +static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +{ + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + if (state->ghcb) { + /* Restore GHCB from Backup */ + *ghcb = *state->ghcb; + data->backup_ghcb_active = false; + state->ghcb = NULL; + } else { + data->ghcb_active = false; + } +} + void noinstr __sev_es_nmi_complete(void) { struct ghcb_state state; -- cgit v1.2.3 From a50c5bebc99c525e7fbc059988c6a5ab8680cb76 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 May 2021 12:42:33 -0500 Subject: x86/sev-es: Invalidate the GHCB after completing VMGEXIT Since the VMGEXIT instruction can be issued from userspace, invalidate the GHCB after performing VMGEXIT processing in the kernel. Invalidation is only required after userspace is available, so call vc_ghcb_invalidate() from sev_es_put_ghcb(). Update vc_ghcb_invalidate() to additionally clear the GHCB exit code so that it is always presented as 0 when VMGEXIT has been issued by anything else besides the kernel. Fixes: 0786138c78e79 ("x86/sev-es: Add a Runtime #VC Exception Handler") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/5a8130462e4f0057ee1184509cd056eedd78742b.1621273353.git.thomas.lendacky@amd.com --- arch/x86/kernel/sev-shared.c | 1 + arch/x86/kernel/sev.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 6ec8b3bfd76e..9f90f460a28c 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -63,6 +63,7 @@ static bool sev_es_negotiate_protocol(void) static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) { + ghcb->save.sw_exit_code = 0; memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 45e212675811..4fa111becc93 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -457,6 +457,11 @@ static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) data->backup_ghcb_active = false; state->ghcb = NULL; } else { + /* + * Invalidate the GHCB so a VMGEXIT instruction issued + * from userspace won't appear to be valid. + */ + vc_ghcb_invalidate(ghcb); data->ghcb_active = false; } } -- cgit v1.2.3 From 3317c26a4b413b41364f2c4b83c778c6aba1576d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 30 Apr 2021 13:22:46 +0800 Subject: perf/x86: Avoid touching LBR_TOS MSR for Arch LBR The Architecture LBR does not have MSR_LBR_TOS (0x000001c9). In a guest that should support Architecture LBR, check_msr() will be a non-related check for the architecture MSR 0x0 (IA32_P5_MC_ADDR) that is also not supported by KVM. The failure will cause x86_pmu.lbr_nr = 0, thereby preventing the initialization of the guest Arch LBR. Fix it by avoiding this extraneous check in intel_pmu_init() for Arch LBR. Fixes: 47125db27e47 ("perf/x86/intel/lbr: Support Architectural LBR") Signed-off-by: Like Xu [peterz: simpler still] Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210430052247.3079672-1-like.xu@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2521d03de5e0..e28892270c58 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6253,7 +6253,7 @@ __init int intel_pmu_init(void) * Check all LBT MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ - if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) x86_pmu.lbr_nr = 0; for (i = 0; i < x86_pmu.lbr_nr; i++) { if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && -- cgit v1.2.3 From 488e13a489e9707a7e81e1991fdd1f20c0f04689 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 30 Apr 2021 13:22:47 +0800 Subject: perf/x86/lbr: Remove cpuc->lbr_xsave allocation from atomic context If the kernel is compiled with the CONFIG_LOCKDEP option, the conditional might_sleep_if() deep in kmem_cache_alloc() will generate the following trace, and potentially cause a deadlock when another LBR event is added: [] BUG: sleeping function called from invalid context at include/linux/sched/mm.h:196 [] Call Trace: [] kmem_cache_alloc+0x36/0x250 [] intel_pmu_lbr_add+0x152/0x170 [] x86_pmu_add+0x83/0xd0 Make it symmetric with the release_lbr_buffers() call and mirror the existing DS buffers. Fixes: c085fb8774 ("perf/x86/intel/lbr: Support XSAVES for arch LBR read") Signed-off-by: Like Xu [peterz: simplified] Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kan Liang Link: https://lkml.kernel.org/r/20210430052247.3079672-2-like.xu@linux.intel.com --- arch/x86/events/core.c | 6 ++++-- arch/x86/events/intel/lbr.c | 26 ++++++++++++++++++++------ arch/x86/events/perf_event.h | 6 ++++++ 3 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8e509325c2c3..8f71dd72ef95 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -396,10 +396,12 @@ int x86_reserve_hardware(void) if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); if (atomic_read(&pmc_refcount) == 0) { - if (!reserve_pmc_hardware()) + if (!reserve_pmc_hardware()) { err = -EBUSY; - else + } else { reserve_ds_buffers(); + reserve_lbr_buffers(); + } } if (!err) atomic_inc(&pmc_refcount); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 76dbab6ac9fb..4409d2cccfda 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -658,7 +658,6 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { - struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (!x86_pmu.lbr_nr) @@ -696,11 +695,6 @@ void intel_pmu_lbr_add(struct perf_event *event) perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); - - if (static_cpu_has(X86_FEATURE_ARCH_LBR) && - kmem_cache && !cpuc->lbr_xsave && - (cpuc->lbr_users != cpuc->lbr_pebs_users)) - cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL); } void release_lbr_buffers(void) @@ -722,6 +716,26 @@ void release_lbr_buffers(void) } } +void reserve_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + kmem_cache = x86_get_pmu(cpu)->task_ctx_cache; + if (!kmem_cache || cpuc->lbr_xsave) + continue; + + cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, GFP_KERNEL, + cpu_to_node(cpu)); + } +} + void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 27fa85e7d4fd..ad87cb36f7c8 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1244,6 +1244,8 @@ void reserve_ds_buffers(void); void release_lbr_buffers(void); +void reserve_lbr_buffers(void); + extern struct event_constraint bts_constraint; extern struct event_constraint vlbr_constraint; @@ -1393,6 +1395,10 @@ static inline void release_lbr_buffers(void) { } +static inline void reserve_lbr_buffers(void) +{ +} + static inline int intel_pmu_init(void) { return 0; -- cgit v1.2.3 From b1efd0ff4bd16e8bb8607ba566b03f2024a830bb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 10 May 2021 23:29:25 +0200 Subject: x86/cpu: Init AP exception handling from cpu_init_secondary() SEV-ES guests require properly setup task register with which the TSS descriptor in the GDT can be located so that the IST-type #VC exception handler which they need to function properly, can be executed. This setup needs to happen before attempting to load microcode in ucode_cpu_init() on secondary CPUs which can cause such #VC exceptions. Simplify the machinery by running that exception setup from a new function cpu_init_secondary() and explicitly call cpu_init_exception_handling() for the boot CPU before cpu_init(). The latter prepares for fixing and simplifying the exception/IST setup on the boot CPU. There should be no functional changes resulting from this patch. [ tglx: Reworked it so cpu_init_exception_handling() stays seperate ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Lai Jiangshan Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/87k0o6gtvu.ffs@nanos.tec.linutronix.de --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 28 +++++++++++++++------------- arch/x86/kernel/smpboot.c | 3 +-- arch/x86/kernel/traps.c | 4 +--- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 556b2b17c3e2..364d0e42e280 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -663,6 +663,7 @@ extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); +extern void cpu_init_secondary(void); extern void cpu_init_exception_handling(void); extern void cr4_init(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a1b756c49a93..212e8bc070da 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1938,13 +1938,12 @@ void cpu_init_exception_handling(void) /* * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. + * initialized (naturally) in the bootstrap process, such as the GDT. We + * reload it nevertheless, this function acts as a 'CPU state barrier', + * nothing should get across. */ void cpu_init(void) { - struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); struct task_struct *cur = current; int cpu = raw_smp_processor_id(); @@ -1957,8 +1956,6 @@ void cpu_init(void) early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif - setup_getcpu(cpu); - pr_debug("Initializing CPU#%d\n", cpu); if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) || @@ -1970,7 +1967,6 @@ void cpu_init(void) * and set up the GDT descriptor: */ switch_to_new_gdt(cpu); - load_current_idt(); if (IS_ENABLED(CONFIG_X86_64)) { loadsegment(fs, 0); @@ -1990,12 +1986,6 @@ void cpu_init(void) initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, cur); - /* Initialize the TSS. */ - tss_setup_ist(tss); - tss_setup_io_bitmap(tss); - set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); - - load_TR_desc(); /* * sp0 points to the entry trampoline stack regardless of what task * is running. @@ -2017,6 +2007,18 @@ void cpu_init(void) load_fixmap_gdt(cpu); } +#ifdef CONFIG_SMP +void cpu_init_secondary(void) +{ + /* + * Relies on the BP having set-up the IDT tables, which are loaded + * on this CPU in cpu_init_exception_handling(). + */ + cpu_init_exception_handling(); + cpu_init(); +} +#endif + /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7770245cc7fa..2ed45b036629 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -232,8 +232,7 @@ static void notrace start_secondary(void *unused) load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - cpu_init_exception_handling(); - cpu_init(); + cpu_init_secondary(); rcu_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); preempt_disable(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 853ea7a80806..41f7dc492803 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1162,9 +1162,7 @@ void __init trap_init(void) idt_setup_traps(); - /* - * Should be a barrier for any external CPU state: - */ + cpu_init_exception_handling(); cpu_init(); idt_setup_ist_traps(); -- cgit v1.2.3 From 1dcc917a0eed934c522d93bb05a9a7bb3c54f96c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 May 2021 13:02:12 +0200 Subject: x86/idt: Rework IDT setup for boot CPU A basic IDT setup for the boot CPU has to be done before invoking cpu_init() because that might trigger #GP when accessing certain MSRs. This setup cannot install the IST variants on 64-bit because the TSS setup which is required for ISTs to work happens in cpu_init(). That leaves a theoretical window where a NMI would invoke the ASM entry point which relies on IST being enabled on the kernel stack which is undefined behaviour. This setup logic has never worked correctly, but on the other hand a NMI hitting the boot CPU before it has fully set up the IDT would be fatal anyway. So the small window between the wrong NMI gate and the IST based NMI gate is not really adding a substantial amount of risk. But the setup logic is nevertheless more convoluted than necessary. The recent separation of the TSS setup into a separate function to ensure that setup so it can setup TSS first, then initialize IDT with the IST variants before invoking cpu_init() and get rid of the post cpu_init() IST setup. Move the invocation of cpu_init_exception_handling() ahead of idt_setup_traps() and merge the IST setup into the default setup table. Reported-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Reviewed-by: Lai Jiangshan Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210507114000.569244755@linutronix.de --- arch/x86/include/asm/desc.h | 2 -- arch/x86/kernel/idt.c | 40 ++++++++++++---------------------------- arch/x86/kernel/traps.c | 7 +++---- 3 files changed, 15 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 476082a83d1c..96021e9bd202 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -421,10 +421,8 @@ extern bool idt_is_f00f_address(unsigned long address); #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); -extern void idt_setup_ist_traps(void); #else static inline void idt_setup_early_pf(void) { } -static inline void idt_setup_ist_traps(void) { } #endif extern void idt_invalidate(void *addr); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d552f177eca0..6cce6047fa12 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -35,12 +35,16 @@ #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) +#ifdef CONFIG_X86_64 /* * Interrupt gate with interrupt stack. The _ist index is the index in * the tss.ist[] array, but for the descriptor it needs to start at 1. */ #define ISTG(_vector, _addr, _ist) \ G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) +#else +#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr) +#endif /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -74,7 +78,7 @@ static const __initconst struct idt_data early_idts[] = { */ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, asm_exc_divide_error), - INTG(X86_TRAP_NMI, asm_exc_nmi), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, asm_exc_device_not_available), @@ -91,12 +95,16 @@ static const __initconst struct idt_data def_idts[] = { #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), #else - INTG(X86_TRAP_DF, asm_exc_double_fault), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), #endif - INTG(X86_TRAP_DB, asm_exc_debug), + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), #ifdef CONFIG_X86_MCE - INTG(X86_TRAP_MC, asm_exc_machine_check), + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), +#endif + +#ifdef CONFIG_AMD_MEM_ENCRYPT + ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), #endif SYSG(X86_TRAP_OF, asm_exc_overflow), @@ -221,22 +229,6 @@ static const __initconst struct idt_data early_pf_idts[] = { INTG(X86_TRAP_PF, asm_exc_page_fault), }; -/* - * The exceptions which use Interrupt stacks. They are setup after - * cpu_init() when the TSS has been initialized. - */ -static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), -#ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), -#endif -#ifdef CONFIG_AMD_MEM_ENCRYPT - ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC), -#endif -}; - /** * idt_setup_early_pf - Initialize the idt table with early pagefault handler * @@ -254,14 +246,6 @@ void __init idt_setup_early_pf(void) idt_setup_from_table(idt_table, early_pf_idts, ARRAY_SIZE(early_pf_idts), true); } - -/** - * idt_setup_ist_traps - Initialize the idt table with traps using IST - */ -void __init idt_setup_ist_traps(void) -{ - idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true); -} #endif static void __init idt_map_in_cea(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 41f7dc492803..ed540e09a399 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1160,10 +1160,9 @@ void __init trap_init(void) /* Init GHCB memory pages when running as an SEV-ES guest */ sev_es_init_vc_handling(); - idt_setup_traps(); - + /* Initialize TSS before setting up traps so ISTs work */ cpu_init_exception_handling(); + /* Setup traps as cpu_init() might #GP */ + idt_setup_traps(); cpu_init(); - - idt_setup_ist_traps(); } -- cgit v1.2.3 From ef4ae6e4413159d2329a172c12e9274e2cb0a3a8 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 19 Apr 2021 21:49:56 +0000 Subject: x86/bus_lock: Set rate limit for bus lock A bus lock can be thousands of cycles slower than atomic operation within one cache line. It also disrupts performance on other cores. Malicious users can generate multiple bus locks to degrade the whole system performance. The current mitigation is to kill the offending process, but for certain scenarios it's desired to identify and throttle the offending application. Add a system wide rate limit for bus locks. When the system detects bus locks at a rate higher than N/sec (where N can be set by the kernel boot argument in the range [1..1000]) any task triggering a bus lock will be forced to sleep for at least 20ms until the overall system rate of bus locks drops below the threshold. Signed-off-by: Fenghua Yu Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lore.kernel.org/r/20210419214958.4035512-3-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8adffc17fa8b..7c23f0397390 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ enum split_lock_detect_state { sld_off = 0, sld_warn, sld_fatal, + sld_ratelimit, }; /* @@ -997,13 +999,30 @@ static const struct { { "off", sld_off }, { "warn", sld_warn }, { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, }; +static struct ratelimit_state bld_ratelimit; + static inline bool match_option(const char *arg, int arglen, const char *opt) { - int len = strlen(opt); + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } - return len == arglen && !strncmp(arg, opt, len); + return len == arglen; } static bool split_lock_verify_msr(bool on) @@ -1082,6 +1101,15 @@ static void sld_update_msr(bool on) static void split_lock_init(void) { + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + if (cpu_model_supports_sld) split_lock_verify_msr(sld_state != sld_off); } @@ -1154,6 +1182,12 @@ void handle_bus_lock(struct pt_regs *regs) switch (sld_state) { case sld_off: break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; case sld_warn: pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", current->comm, current->pid, regs->ip); @@ -1259,6 +1293,10 @@ static void sld_state_show(void) " from non-WB" : ""); } break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; } } -- cgit v1.2.3 From add0b32ef9146a8559a60aed54c37692a5f9d34f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Apr 2021 17:06:01 -0500 Subject: siginfo: Move si_trapno inside the union inside _si_fault It turns out that linux uses si_trapno very sparingly, and as such it can be considered extra information for a very narrow selection of signals, rather than information that is present with every fault reported in siginfo. As such move si_trapno inside the union inside of _si_fault. This results in no change in placement, and makes it eaiser to extend _si_fault in the future as this reduces the number of special cases. In particular with si_trapno included in the union it is no longer a concern that the union must be pointer aligned on most architectures because the union follows immediately after si_addr which is a pointer. This change results in a difference in siginfo field placement on sparc and alpha for the fields si_addr_lsb, si_lower, si_upper, si_pkey, and si_perf. These architectures do not implement the signals that would use si_addr_lsb, si_lower, si_upper, si_pkey, and si_perf. Further these architecture have not yet implemented the userspace that would use si_perf. The point of this change is in fact to correct these placement issues before sparc or alpha grow userspace that cares. This change was discussed[1] and the agreement is that this change is currently safe. [1]: https://lkml.kernel.org/r/CAK8P3a0+uKYwL1NhY6Hvtieghba2hKYGD6hcKx5n8=4Gtt+pHA@mail.gmail.com Acked-by: Marco Elver v1: https://lkml.kernel.org/r/m1tunns7yf.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210505141101.11519-5-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20210517195748.8880-1-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- arch/x86/kernel/signal_compat.c | 3 +++ include/linux/compat.h | 5 ++--- include/uapi/asm-generic/siginfo.h | 7 ++----- kernel/signal.c | 1 + 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 0e5d0a7e203b..a9fcabd8a5e5 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -127,6 +127,9 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C); + BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10); diff --git a/include/linux/compat.h b/include/linux/compat.h index f0d2dd35d408..6af7bef15e94 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -214,12 +214,11 @@ typedef struct compat_siginfo { /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */ struct { compat_uptr_t _addr; /* faulting insn/memory ref. */ -#ifdef __ARCH_SI_TRAPNO - int _trapno; /* TRAP # which caused the signal */ -#endif #define __COMPAT_ADDR_BND_PKEY_PAD (__alignof__(compat_uptr_t) < sizeof(short) ? \ sizeof(short) : __alignof__(compat_uptr_t)) union { + /* used on alpha and sparc */ + int _trapno; /* TRAP # which caused the signal */ /* * used when si_code=BUS_MCEERR_AR or * used when si_code=BUS_MCEERR_AO diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 03d6f6d2c1fe..e663bf117b46 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -63,9 +63,6 @@ union __sifields { /* SIGILL, SIGFPE, SIGSEGV, SIGBUS, SIGTRAP, SIGEMT */ struct { void __user *_addr; /* faulting insn/memory ref. */ -#ifdef __ARCH_SI_TRAPNO - int _trapno; /* TRAP # which caused the signal */ -#endif #ifdef __ia64__ int _imm; /* immediate value for "break" */ unsigned int _flags; /* see ia64 si_flags */ @@ -75,6 +72,8 @@ union __sifields { #define __ADDR_BND_PKEY_PAD (__alignof__(void *) < sizeof(short) ? \ sizeof(short) : __alignof__(void *)) union { + /* used on alpha and sparc */ + int _trapno; /* TRAP # which caused the signal */ /* * used when si_code=BUS_MCEERR_AR or * used when si_code=BUS_MCEERR_AO @@ -150,9 +149,7 @@ typedef struct siginfo { #define si_int _sifields._rt._sigval.sival_int #define si_ptr _sifields._rt._sigval.sival_ptr #define si_addr _sifields._sigfault._addr -#ifdef __ARCH_SI_TRAPNO #define si_trapno _sifields._sigfault._trapno -#endif #define si_addr_lsb _sifields._sigfault._addr_lsb #define si_lower _sifields._sigfault._addr_bnd._lower #define si_upper _sifields._sigfault._addr_bnd._upper diff --git a/kernel/signal.c b/kernel/signal.c index c3017aa8024a..65888aec65a0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4607,6 +4607,7 @@ static inline void siginfo_buildtime_checks(void) /* sigfault */ CHECK_OFFSET(si_addr); + CHECK_OFFSET(si_trapno); CHECK_OFFSET(si_addr_lsb); CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); -- cgit v1.2.3 From 0683b53197b55343a166f1507086823030809a19 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 2 May 2021 17:28:31 -0500 Subject: signal: Deliver all of the siginfo perf data in _perf Don't abuse si_errno and deliver all of the perf data in _perf member of siginfo_t. Note: The data field in the perf data structures in a u64 to allow a pointer to be encoded without needed to implement a 32bit and 64bit version of the same structure. There already exists a 32bit and 64bit versions siginfo_t, and the 32bit version can not include a 64bit member as it only has 32bit alignment. So unsigned long is used in siginfo_t instead of a u64 as unsigned long can encode a pointer on all architectures linux supports. v1: https://lkml.kernel.org/r/m11rarqqx2.fsf_-_@fess.ebiederm.org v2: https://lkml.kernel.org/r/20210503203814.25487-10-ebiederm@xmission.com v3: https://lkml.kernel.org/r/20210505141101.11519-11-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20210517195748.8880-4-ebiederm@xmission.com Reviewed-by: Marco Elver Signed-off-by: "Eric W. Biederman" --- arch/m68k/kernel/signal.c | 3 ++- arch/x86/kernel/signal_compat.c | 6 ++++-- fs/signalfd.c | 3 ++- include/linux/compat.h | 5 ++++- include/uapi/asm-generic/siginfo.h | 8 ++++++-- include/uapi/linux/perf_event.h | 2 +- include/uapi/linux/signalfd.h | 4 ++-- kernel/signal.c | 21 +++++++++++++-------- .../testing/selftests/perf_events/sigtrap_threads.c | 14 +++++++------- 9 files changed, 41 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index a4b7ee1df211..8f215e79e70e 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -623,7 +623,8 @@ static inline void siginfo_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12); /* _sigfault._perf */ - BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x14); /* _sigpoll */ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x0c); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a9fcabd8a5e5..06743ec054d2 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -141,8 +141,10 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); - BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18); - BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14); CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); diff --git a/fs/signalfd.c b/fs/signalfd.c index e87e59581653..373df2f12415 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -134,7 +134,8 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, break; case SIL_PERF_EVENT: new.ssi_addr = (long) kinfo->si_addr; - new.ssi_perf = kinfo->si_perf; + new.ssi_perf_type = kinfo->si_perf_type; + new.ssi_perf_data = kinfo->si_perf_data; break; case SIL_CHLD: new.ssi_pid = kinfo->si_pid; diff --git a/include/linux/compat.h b/include/linux/compat.h index 6af7bef15e94..a27fffaae121 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -236,7 +236,10 @@ typedef struct compat_siginfo { u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ - compat_ulong_t _perf; + struct { + compat_ulong_t _data; + u32 _type; + } _perf; }; } _sigfault; diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index e663bf117b46..5a3c221f4c9d 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -91,7 +91,10 @@ union __sifields { __u32 _pkey; } _addr_pkey; /* used when si_code=TRAP_PERF */ - unsigned long _perf; + struct { + unsigned long _data; + __u32 _type; + } _perf; }; } _sigfault; @@ -154,7 +157,8 @@ typedef struct siginfo { #define si_lower _sifields._sigfault._addr_bnd._lower #define si_upper _sifields._sigfault._addr_bnd._upper #define si_pkey _sifields._sigfault._addr_pkey._pkey -#define si_perf _sifields._sigfault._perf +#define si_perf_data _sifields._sigfault._perf._data +#define si_perf_type _sifields._sigfault._perf._type #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd #define si_call_addr _sifields._sigsys._call_addr diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e54e639248c8..7b14753b3d38 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -464,7 +464,7 @@ struct perf_event_attr { /* * User provided data if sigtrap=1, passed back to user via - * siginfo_t::si_perf, e.g. to permit user to identify the event. + * siginfo_t::si_perf_data, e.g. to permit user to identify the event. */ __u64 sig_data; }; diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h index 7e333042c7e3..e78dddf433fc 100644 --- a/include/uapi/linux/signalfd.h +++ b/include/uapi/linux/signalfd.h @@ -39,8 +39,8 @@ struct signalfd_siginfo { __s32 ssi_syscall; __u64 ssi_call_addr; __u32 ssi_arch; - __u32 __pad3; - __u64 ssi_perf; + __u32 ssi_perf_type; + __u64 ssi_perf_data; /* * Pad strcture to 128 bytes. Remember to update the diff --git a/kernel/signal.c b/kernel/signal.c index 3a18d13c39b2..dca53515ae3f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1768,11 +1768,13 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) struct kernel_siginfo info; clear_siginfo(&info); - info.si_signo = SIGTRAP; - info.si_errno = type; - info.si_code = TRAP_PERF; - info.si_addr = addr; - info.si_perf = sig_data; + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_PERF; + info.si_addr = addr; + info.si_perf_data = sig_data; + info.si_perf_type = type; + return force_sig_info(&info); } @@ -3356,7 +3358,8 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, break; case SIL_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); - to->si_perf = from->si_perf; + to->si_perf_data = from->si_perf_data; + to->si_perf_type = from->si_perf_type; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3432,7 +3435,8 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, break; case SIL_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); - to->si_perf = from->si_perf; + to->si_perf_data = from->si_perf_data; + to->si_perf_type = from->si_perf_type; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4615,7 +4619,8 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); - CHECK_OFFSET(si_perf); + CHECK_OFFSET(si_perf_data); + CHECK_OFFSET(si_perf_type); /* sigpoll */ CHECK_OFFSET(si_band); diff --git a/tools/testing/selftests/perf_events/sigtrap_threads.c b/tools/testing/selftests/perf_events/sigtrap_threads.c index 78ddf5e11625..8e83cf91513a 100644 --- a/tools/testing/selftests/perf_events/sigtrap_threads.c +++ b/tools/testing/selftests/perf_events/sigtrap_threads.c @@ -43,7 +43,7 @@ static struct { siginfo_t first_siginfo; /* First observed siginfo_t. */ } ctx; -/* Unique value to check si_perf is correctly set from perf_event_attr::sig_data. */ +/* Unique value to check si_perf_data is correctly set from perf_event_attr::sig_data. */ #define TEST_SIG_DATA(addr) (~(unsigned long)(addr)) static struct perf_event_attr make_event_attr(bool enabled, volatile void *addr) @@ -164,8 +164,8 @@ TEST_F(sigtrap_threads, enable_event) EXPECT_EQ(ctx.signal_count, NUM_THREADS); EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); - EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); /* Check enabled for parent. */ ctx.iterate_on = 0; @@ -183,8 +183,8 @@ TEST_F(sigtrap_threads, modify_and_enable_event) EXPECT_EQ(ctx.signal_count, NUM_THREADS); EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); - EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); /* Check enabled for parent. */ ctx.iterate_on = 0; @@ -203,8 +203,8 @@ TEST_F(sigtrap_threads, signal_stress) EXPECT_EQ(ctx.signal_count, NUM_THREADS * ctx.iterate_on); EXPECT_EQ(ctx.tids_want_signal, 0); EXPECT_EQ(ctx.first_siginfo.si_addr, &ctx.iterate_on); - EXPECT_EQ(ctx.first_siginfo.si_errno, PERF_TYPE_BREAKPOINT); - EXPECT_EQ(ctx.first_siginfo.si_perf, TEST_SIG_DATA(&ctx.iterate_on)); + EXPECT_EQ(ctx.first_siginfo.si_perf_type, PERF_TYPE_BREAKPOINT); + EXPECT_EQ(ctx.first_siginfo.si_perf_data, TEST_SIG_DATA(&ctx.iterate_on)); } TEST_HARNESS_MAIN -- cgit v1.2.3 From 939ef713297df2cc910592305aa26af0e87f28ac Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:16 -0700 Subject: x86/signal: Introduce helpers to get the maximum signal frame size Signal frames do not have a fixed format and can vary in size when a number of things change: supported XSAVE features, 32 vs. 64-bit apps, etc. Add support for a runtime method for userspace to dynamically discover how large a signal stack needs to be. Introduce a new variable, max_frame_size, and helper functions for the calculation to be used in a new user interface. Set max_frame_size to a system-wide worst-case value, instead of storing multiple app-specific values. Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Acked-by: H.J. Lu Link: https://lkml.kernel.org/r/20210518200320.17239-3-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/signal.h | 2 ++ arch/x86/include/asm/sigframe.h | 2 ++ arch/x86/kernel/cpu/common.c | 3 ++ arch/x86/kernel/fpu/signal.c | 19 +++++++++++++ arch/x86/kernel/signal.c | 59 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 83 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 7fb516b6893a..8b6631dffefd 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -29,6 +29,8 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size); +unsigned long fpu__get_fpstate_size(void); + extern void fpu__init_prepare_fx_sw_frame(void); #endif /* _ASM_X86_FPU_SIGNAL_H */ diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 84eab2724875..5b1ed650b124 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -85,4 +85,6 @@ struct rt_sigframe_x32 { #endif /* CONFIG_X86_64 */ +void __init init_sigframe_size(void); + #endif /* _ASM_X86_SIGFRAME_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a1b756c49a93..9173dd475818 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "cpu.h" @@ -1332,6 +1333,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) fpu__init_system(c); + init_sigframe_size(); + #ifdef CONFIG_X86_32 /* * Regardless of whether PCID is enumerated, the SDM says diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a4ec65317a7f..dbb304e48f16 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -507,6 +507,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, return sp; } + +unsigned long fpu__get_fpstate_size(void) +{ + unsigned long ret = xstate_sigframe_size(); + + /* + * This space is needed on (most) 32-bit kernels, or when a 32-bit + * app is running on a 64-bit kernel. To keep things simple, just + * assume the worst case and always include space for 'freg_state', + * even for 64-bit apps on 64-bit kernels. This wastes a bit of + * space, but keeps the code simple. + */ + if ((IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) && use_fxsr()) + ret += sizeof(struct fregs_state); + + return ret; +} + /* * Prepare the SW reserved portion of the fxsave memory layout, indicating * the presence of the extended state information in the memory layout diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index a06cb107c0e8..689a4b6dd18f 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -212,6 +212,11 @@ do { \ * Set up a signal frame. */ +/* x86 ABI requires 16-byte alignment */ +#define FRAME_ALIGNMENT 16UL + +#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1) + /* * Determine which stack to use.. */ @@ -222,9 +227,9 @@ static unsigned long align_sigframe(unsigned long sp) * Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - sp = ((sp + 4) & -16ul) - 4; + sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4; #else /* !CONFIG_X86_32 */ - sp = round_down(sp, 16) - 8; + sp = round_down(sp, FRAME_ALIGNMENT) - 8; #endif return sp; } @@ -663,6 +668,56 @@ badframe: return 0; } +/* + * There are four different struct types for signal frame: sigframe_ia32, + * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case + * -- the largest size. It means the size for 64-bit apps is a bit more + * than needed, but this keeps the code simple. + */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32) +#else +# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe) +#endif + +/* + * The FP state frame contains an XSAVE buffer which must be 64-byte aligned. + * If a signal frame starts at an unaligned address, extra space is required. + * This is the max alignment padding, conservatively. + */ +#define MAX_XSAVE_PADDING 63UL + +/* + * The frame data is composed of the following areas and laid out as: + * + * ------------------------- + * | alignment padding | + * ------------------------- + * | (f)xsave frame | + * ------------------------- + * | fsave header | + * ------------------------- + * | alignment padding | + * ------------------------- + * | siginfo + ucontext | + * ------------------------- + */ + +/* max_frame_size tells userspace the worst case signal stack size. */ +static unsigned long __ro_after_init max_frame_size; + +void __init init_sigframe_size(void) +{ + max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING; + + max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING; + + /* Userspace expects an aligned size. */ + max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT); + + pr_info("max sigframe size: %lu\n", max_frame_size); +} + static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && -- cgit v1.2.3 From 1c33bb0507508af24fd754dd7123bd8e997fab2f Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:17 -0700 Subject: x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for use by all architectures with sigaltstack(2). Over time, the hardware state size grew, but these constants did not evolve. Today, literal use of these constants on several architectures may result in signal stack overflow, and thus user data corruption. A few years ago, the ARM team addressed this issue by establishing getauxval(AT_MINSIGSTKSZ). This enables the kernel to supply a value at runtime that is an appropriate replacement on current and future hardware. Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support added for ARM in 94b07c1f8c39 ("arm64: signal: Report signal frame size to userspace via auxv"). Also, include a documentation to describe x86-specific auxiliary vectors. Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210518200320.17239-4-chang.seok.bae@intel.com --- Documentation/x86/elf_auxvec.rst | 53 ++++++++++++++++++++++++++++++++++++++ Documentation/x86/index.rst | 1 + arch/x86/include/asm/elf.h | 4 +++ arch/x86/include/uapi/asm/auxvec.h | 4 +-- arch/x86/kernel/signal.c | 5 ++++ 5 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 Documentation/x86/elf_auxvec.rst (limited to 'arch/x86') diff --git a/Documentation/x86/elf_auxvec.rst b/Documentation/x86/elf_auxvec.rst new file mode 100644 index 000000000000..18e4744717f9 --- /dev/null +++ b/Documentation/x86/elf_auxvec.rst @@ -0,0 +1,53 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +x86-specific ELF Auxiliary Vectors +================================== + +This document describes the semantics of the x86 auxiliary vectors. + +Introduction +============ + +ELF Auxiliary vectors enable the kernel to efficiently provide +configuration-specific parameters to userspace. In this example, a program +allocates an alternate stack based on the kernel-provided size:: + + #include + #include + #include + #include + #include + #include + + #ifndef AT_MINSIGSTKSZ + #define AT_MINSIGSTKSZ 51 + #endif + + .... + stack_t ss; + + ss.ss_sp = malloc(ss.ss_size); + assert(ss.ss_sp); + + ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ; + ss.ss_flags = 0; + + if (sigaltstack(&ss, NULL)) + err(1, "sigaltstack"); + + +The exposed auxiliary vectors +============================= + +AT_SYSINFO is used for locating the vsyscall entry point. It is not +exported on 64-bit mode. + +AT_SYSINFO_EHDR is the start address of the page containing the vDSO. + +AT_MINSIGSTKSZ denotes the minimum stack size required by the kernel to +deliver a signal to user-space. AT_MINSIGSTKSZ comprehends the space +consumed by the kernel to accommodate the user context for the current +hardware configuration. It does not comprehend subsequent user-space stack +consumption, which must be added by the user. (e.g. Above, user-space adds +SIGSTKSZ to AT_MINSIGSTKSZ.) diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst index 4693e192b447..d58614d5cde6 100644 --- a/Documentation/x86/index.rst +++ b/Documentation/x86/index.rst @@ -35,3 +35,4 @@ x86-specific Documentation sva sgx features + elf_auxvec diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 7d7500806af8..29fea180a665 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -312,6 +312,7 @@ do { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* @@ -328,6 +329,7 @@ extern unsigned long task_size_32bit(void); extern unsigned long task_size_64bit(int full_addr_space); extern unsigned long get_mmap_base(int is_legacy); extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); +extern unsigned long get_sigframe_size(void); #ifdef CONFIG_X86_32 @@ -349,6 +351,7 @@ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) /* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ @@ -357,6 +360,7 @@ do { \ if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long __force)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \ } while (0) #define AT_SYSINFO 32 diff --git a/arch/x86/include/uapi/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h index 580e3c567046..6beb55bbefa4 100644 --- a/arch/x86/include/uapi/asm/auxvec.h +++ b/arch/x86/include/uapi/asm/auxvec.h @@ -12,9 +12,9 @@ /* entries in ARCH_DLINFO: */ #if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) -# define AT_VECTOR_SIZE_ARCH 2 +# define AT_VECTOR_SIZE_ARCH 3 #else /* else it's non-compat x86-64 */ -# define AT_VECTOR_SIZE_ARCH 1 +# define AT_VECTOR_SIZE_ARCH 2 #endif #endif /* _ASM_X86_AUXVEC_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 689a4b6dd18f..86e53868159a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -718,6 +718,11 @@ void __init init_sigframe_size(void) pr_info("max sigframe size: %lu\n", max_frame_size); } +unsigned long get_sigframe_size(void) +{ + return max_frame_size; +} + static inline int is_ia32_compat_frame(struct ksignal *ksig) { return IS_ENABLED(CONFIG_IA32_EMULATION) && -- cgit v1.2.3 From 2beb4a53fc3f1081cedc1c1a198c7f56cc4fc60c Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 May 2021 13:03:19 -0700 Subject: x86/signal: Detect and prevent an alternate signal stack overflow The kernel pushes context on to the userspace stack to prepare for the user's signal handler. When the user has supplied an alternate signal stack, via sigaltstack(2), it is easy for the kernel to verify that the stack size is sufficient for the current hardware context. Check if writing the hardware context to the alternate stack will exceed it's size. If yes, then instead of corrupting user-data and proceeding with the original signal handler, an immediate SIGSEGV signal is delivered. Refactor the stack pointer check code from on_sig_stack() and use the new helper. While the kernel allows new source code to discover and use a sufficient alternate signal stack size, this check is still necessary to protect binaries with insufficient alternate signal stack size from data corruption. Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch") Reported-by: Florian Weimer Suggested-by: Jann Horn Suggested-by: Andy Lutomirski Signed-off-by: Chang S. Bae Signed-off-by: Borislav Petkov Reviewed-by: Len Brown Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20210518200320.17239-6-chang.seok.bae@intel.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=153531 --- arch/x86/kernel/signal.c | 24 ++++++++++++++++++++---- include/linux/sched/signal.h | 19 ++++++++++++------- 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 86e53868159a..2ddcf2165bcb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -239,10 +239,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { /* Default to using normal stack */ + bool nested_altstack = on_sig_stack(regs->sp); + bool entering_altstack = false; unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; - int onsigstack = on_sig_stack(sp); int ret; /* redzone */ @@ -251,15 +252,23 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(sp) == 0) + /* + * This checks nested_altstack via sas_ss_flags(). Sensible + * programs use SS_AUTODISARM, which disables that check, and + * programs that don't use SS_AUTODISARM get compatible. + */ + if (sas_ss_flags(sp) == 0) { sp = current->sas_ss_sp + current->sas_ss_size; + entering_altstack = true; + } } else if (IS_ENABLED(CONFIG_X86_32) && - !onsigstack && + !nested_altstack && regs->ss != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { /* This is the legacy signal stack switching. */ sp = (unsigned long) ka->sa.sa_restorer; + entering_altstack = true; } sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), @@ -272,8 +281,15 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, * If we are on the alternate signal stack and would overflow it, don't. * Return an always-bogus address instead so we will die with SIGSEGV. */ - if (onsigstack && !likely(on_sig_stack(sp))) + if (unlikely((nested_altstack || entering_altstack) && + !__on_sig_stack(sp))) { + + if (show_unhandled_signals && printk_ratelimit()) + pr_info("%s[%d] overflowed sigaltstack\n", + current->comm, task_pid_nr(current)); + return (void __user *)-1L; + } /* save i387 and extended state */ ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3f6a0fcaa10c..ae60f838ebb9 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -537,6 +537,17 @@ static inline int kill_cad_pid(int sig, int priv) #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) +static inline int __on_sig_stack(unsigned long sp) +{ +#ifdef CONFIG_STACK_GROWSUP + return sp >= current->sas_ss_sp && + sp - current->sas_ss_sp < current->sas_ss_size; +#else + return sp > current->sas_ss_sp && + sp - current->sas_ss_sp <= current->sas_ss_size; +#endif +} + /* * True if we are on the alternate signal stack. */ @@ -554,13 +565,7 @@ static inline int on_sig_stack(unsigned long sp) if (current->sas_ss_flags & SS_AUTODISARM) return 0; -#ifdef CONFIG_STACK_GROWSUP - return sp >= current->sas_ss_sp && - sp - current->sas_ss_sp < current->sas_ss_size; -#else - return sp > current->sas_ss_sp && - sp - current->sas_ss_sp <= current->sas_ss_size; -#endif + return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) -- cgit v1.2.3 From 0024430e920f2900654ad83cd081cf52e02a3ef5 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 18 May 2021 12:01:06 -0700 Subject: x86/build: Fix location of '-plugin-opt=' flags Commit b33fff07e3e3 ("x86, build: allow LTO to be selected") added a couple of '-plugin-opt=' flags to KBUILD_LDFLAGS because the code model and stack alignment are not stored in LLVM bitcode. However, these flags were added to KBUILD_LDFLAGS prior to the emulation flag assignment, which uses ':=', so they were overwritten and never added to $(LD) invocations. The absence of these flags caused misalignment issues in the AMDGPU driver when compiling with CONFIG_LTO_CLANG, resulting in general protection faults. Shuffle the assignment below the initial one so that the flags are properly passed along and all of the linker flags stay together. At the same time, avoid any future issues with clobbering flags by changing the emulation flag assignment to '+=' since KBUILD_LDFLAGS is already defined with ':=' in the main Makefile before being exported for modification here as a result of commit: ce99d0bf312d ("kbuild: clear LDFLAGS in the top Makefile") Fixes: b33fff07e3e3 ("x86, build: allow LTO to be selected") Reported-by: Anthony Ruhier Signed-off-by: Nathan Chancellor Signed-off-by: Ingo Molnar Tested-by: Anthony Ruhier Cc: stable@vger.kernel.org Link: https://github.com/ClangBuiltLinux/linux/issues/1374 Link: https://lore.kernel.org/r/20210518190106.60935-1-nathan@kernel.org --- arch/x86/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c77c5d8a7b3e..307529417021 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -178,11 +178,6 @@ ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,) endif -ifdef CONFIG_LTO_CLANG -KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \ - -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) -endif - # Workaround for a gcc prelease that unfortunately was shipped in a suse release KBUILD_CFLAGS += -Wno-sign-compare # @@ -202,7 +197,12 @@ ifdef CONFIG_RETPOLINE endif endif -KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) + +ifdef CONFIG_LTO_CLANG +KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \ + -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) +endif ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs --discard-none -- cgit v1.2.3 From b250f2f7792d15bcde98e0456781e2835556d5fa Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 19 May 2021 15:52:44 +0200 Subject: x86/sev-es: Don't return NULL from sev_es_get_ghcb() sev_es_get_ghcb() is called from several places but only one of them checks the return value. The reaction to returning NULL is always the same: calling panic() and kill the machine. Instead of adding checks to all call sites, move the panic() into the function itself so that it will no longer return NULL. Fixes: 0786138c78e7 ("x86/sev-es: Add a Runtime #VC Exception Handler") Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org # v5.10+ Link: https://lkml.kernel.org/r/20210519135251.30093-2-joro@8bytes.org --- arch/x86/kernel/sev.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 4fa111becc93..82bced88153b 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -203,8 +203,18 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) if (unlikely(data->ghcb_active)) { /* GHCB is already in use - save its contents */ - if (unlikely(data->backup_ghcb_active)) - return NULL; + if (unlikely(data->backup_ghcb_active)) { + /* + * Backup-GHCB is also already in use. There is no way + * to continue here so just kill the machine. To make + * panic() work, mark GHCBs inactive so that messages + * can be printed out. + */ + data->ghcb_active = false; + data->backup_ghcb_active = false; + + panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + } /* Mark backup_ghcb active before writing to it */ data->backup_ghcb_active = true; @@ -1289,7 +1299,6 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) */ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) { - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; @@ -1315,16 +1324,6 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) */ ghcb = sev_es_get_ghcb(&state); - if (!ghcb) { - /* - * Mark GHCBs inactive so that panic() is able to print the - * message. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - } vc_ghcb_invalidate(ghcb); result = vc_init_em_ctxt(&ctxt, regs, error_code); -- cgit v1.2.3 From c25bbdb564060adaad5c3a8a10765c13487ba6a3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 19 May 2021 15:52:45 +0200 Subject: x86/sev-es: Forward page-faults which happen during emulation When emulating guest instructions for MMIO or IOIO accesses, the #VC handler might get a page-fault and will not be able to complete. Forward the page-fault in this case to the correct handler instead of killing the machine. Fixes: 0786138c78e7 ("x86/sev-es: Add a Runtime #VC Exception Handler") Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org # v5.10+ Link: https://lkml.kernel.org/r/20210519135251.30093-3-joro@8bytes.org --- arch/x86/kernel/sev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 82bced88153b..1f428f401bed 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1270,6 +1270,10 @@ static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) case X86_TRAP_UD: exc_invalid_op(ctxt->regs); break; + case X86_TRAP_PF: + write_cr2(ctxt->fi.cr2); + exc_page_fault(ctxt->regs, error_code); + break; case X86_TRAP_AC: exc_alignment_check(ctxt->regs, error_code); break; -- cgit v1.2.3 From 4954f5b8ef0baf70fe978d1a99a5f70e4dd5c877 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 19 May 2021 15:52:46 +0200 Subject: x86/sev-es: Use __put_user()/__get_user() for data accesses The put_user() and get_user() functions do checks on the address which is passed to them. They check whether the address is actually a user-space address and whether its fine to access it. They also call might_fault() to indicate that they could fault and possibly sleep. All of these checks are neither wanted nor needed in the #VC exception handler, which can be invoked from almost any context and also for MMIO instructions from kernel space on kernel memory. All the #VC handler wants to know is whether a fault happened when the access was tried. This is provided by __put_user()/__get_user(), which just do the access no matter what. Also add comments explaining why __get_user() and __put_user() are the best choice here and why it is safe to use them in this context. Also explain why copy_to/from_user can't be used. In addition, also revert commit 7024f60d6552 ("x86/sev-es: Handle string port IO to kernel memory properly") because using __get_user()/__put_user() fixes the same problem while the above commit introduced several problems: 1) It uses access_ok() which is only allowed in task context. 2) It uses memcpy() which has no fault handling at all and is thus unsafe to use here. [ bp: Fix up commit ID of the reverted commit above. ] Fixes: f980f9c31a92 ("x86/sev-es: Compile early handler code into kernel image") Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org # v5.10+ Link: https://lkml.kernel.org/r/20210519135251.30093-4-joro@8bytes.org --- arch/x86/kernel/sev.c | 66 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 1f428f401bed..651b81cd648e 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -315,31 +315,44 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; - /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ - if (!user_mode(ctxt->regs) && !access_ok(target, size)) { - memcpy(dst, buf, size); - return ES_OK; - } - + /* + * This function uses __put_user() independent of whether kernel or user + * memory is accessed. This works fine because __put_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __put_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_to_user() here because + * vc_write_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ switch (size) { case 1: memcpy(&d1, buf, 1); - if (put_user(d1, target)) + if (__put_user(d1, target)) goto fault; break; case 2: memcpy(&d2, buf, 2); - if (put_user(d2, target)) + if (__put_user(d2, target)) goto fault; break; case 4: memcpy(&d4, buf, 4); - if (put_user(d4, target)) + if (__put_user(d4, target)) goto fault; break; case 8: memcpy(&d8, buf, 8); - if (put_user(d8, target)) + if (__put_user(d8, target)) goto fault; break; default: @@ -370,30 +383,43 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, u16 d2; u8 d1; - /* If instruction ran in kernel mode and the I/O buffer is in kernel space */ - if (!user_mode(ctxt->regs) && !access_ok(s, size)) { - memcpy(buf, src, size); - return ES_OK; - } - + /* + * This function uses __get_user() independent of whether kernel or user + * memory is accessed. This works fine because __get_user() does no + * sanity checks of the pointer being accessed. All that it does is + * to report when the access failed. + * + * Also, this function runs in atomic context, so __get_user() is not + * allowed to sleep. The page-fault handler detects that it is running + * in atomic context and will not try to take mmap_sem and handle the + * fault, so additional pagefault_enable()/disable() calls are not + * needed. + * + * The access can't be done via copy_from_user() here because + * vc_read_mem() must not use string instructions to access unsafe + * memory. The reason is that MOVS is emulated by the #VC handler by + * splitting the move up into a read and a write and taking a nested #VC + * exception on whatever of them is the MMIO access. Using string + * instructions here would cause infinite nesting. + */ switch (size) { case 1: - if (get_user(d1, s)) + if (__get_user(d1, s)) goto fault; memcpy(buf, &d1, 1); break; case 2: - if (get_user(d2, s)) + if (__get_user(d2, s)) goto fault; memcpy(buf, &d2, 2); break; case 4: - if (get_user(d4, s)) + if (__get_user(d4, s)) goto fault; memcpy(buf, &d4, 4); break; case 8: - if (get_user(d8, s)) + if (__get_user(d8, s)) goto fault; memcpy(buf, &d8, 8); break; -- cgit v1.2.3 From 2e958a8a510d956ec8528f0bd20e309b5bb5156c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:09 +0900 Subject: x86/entry/x32: Rename __x32_compat_sys_* to __x64_compat_sys_* The SYSCALL macros are mapped to symbols as follows: __SYSCALL_COMMON(nr, sym) --> __x64_ __SYSCALL_X32(nr, sym) --> __x32_ Originally, the syscalls in the x32 special range (512-547) were all compat. This assumption is now broken after the following commits: 55db9c0e8534 ("net: remove compat_sys_{get,set}sockopt") 5f764d624a89 ("fs: remove the compat readv/writev syscalls") 598b3cec831f ("fs: remove compat_sys_vmsplice") c3973b401ef2 ("mm: remove compat_process_vm_{readv,writev}") Those commits redefined __x32_sys_* to __x64_sys_* because there is no stub like __x32_sys_*. Defining them as follows is more sensible and cleaner. __SYSCALL_COMMON(nr, sym) --> __x64_ __SYSCALL_X32(nr, sym) --> __x64_ This works because both x86_64 and x32 use the same ABI (RDI, RSI, RDX, R10, R8, R9) The ugly #define __x32_sys_* will go away. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-2-masahiroy@kernel.org --- arch/x86/entry/syscall_x32.c | 16 ++-------------- arch/x86/include/asm/syscall_wrapper.h | 10 +++++----- 2 files changed, 7 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index f2fe0a33bcfd..3fea8fb9cd6a 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,27 +8,15 @@ #include #include -/* - * Reuse the 64-bit entry points for the x32 versions that occupy different - * slots in the syscall table. - */ -#define __x32_sys_readv __x64_sys_readv -#define __x32_sys_writev __x64_sys_writev -#define __x32_sys_getsockopt __x64_sys_getsockopt -#define __x32_sys_setsockopt __x64_sys_setsockopt -#define __x32_sys_vmsplice __x64_sys_vmsplice -#define __x32_sys_process_vm_readv __x64_sys_process_vm_readv -#define __x32_sys_process_vm_writev __x64_sys_process_vm_writev - #define __SYSCALL_64(nr, sym) -#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); +#define __SYSCALL_X32(nr, sym) extern long __x64_##sym(const struct pt_regs *); #define __SYSCALL_COMMON(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include #undef __SYSCALL_X32 #undef __SYSCALL_COMMON -#define __SYSCALL_X32(nr, sym) [nr] = __x32_##sym, +#define __SYSCALL_X32(nr, sym) [nr] = __x64_##sym, #define __SYSCALL_COMMON(nr, sym) [nr] = __x64_##sym, asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index 80c08c7d5e72..6a2827d0681f 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -17,7 +17,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * __x64_sys_*() - 64-bit native syscall * __ia32_sys_*() - 32-bit native syscall or common compat syscall * __ia32_compat_sys_*() - 32-bit compat syscall - * __x32_compat_sys_*() - 64-bit X32 compat syscall + * __x64_compat_sys_*() - 64-bit X32 compat syscall * * The registers are decoded according to the ABI: * 64-bit: RDI, RSI, RDX, R10, R8, R9 @@ -166,17 +166,17 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs); * with x86_64 obviously do not need such care. */ #define __X32_COMPAT_SYS_STUB0(name) \ - __SYS_STUB0(x32, compat_sys_##name) + __SYS_STUB0(x64, compat_sys_##name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) \ - __SYS_STUBx(x32, compat_sys##name, \ + __SYS_STUBx(x64, compat_sys##name, \ SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__)) #define __X32_COMPAT_COND_SYSCALL(name) \ - __COND_SYSCALL(x32, compat_sys_##name) + __COND_SYSCALL(x64, compat_sys_##name) #define __X32_COMPAT_SYS_NI(name) \ - __SYS_NI(x32, compat_sys_##name) + __SYS_NI(x64, compat_sys_##name) #else /* CONFIG_X86_X32 */ #define __X32_COMPAT_SYS_STUB0(name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) -- cgit v1.2.3 From 6218d0f6b8dece1f2e82f0a47a0e6b8ecb631ef6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:10 +0900 Subject: x86/syscalls: Switch to generic syscalltbl.sh Many architectures duplicate similar shell scripts. Convert x86 and UML to use scripts/syscalltbl.sh. The generic script generates seperate headers for x86/64 and x86/x32 syscalls, while the x86 specific script coalesced them into one. Adjust the code accordingly. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-3-masahiroy@kernel.org --- arch/x86/entry/syscall_32.c | 12 ++++++--- arch/x86/entry/syscall_64.c | 9 +++---- arch/x86/entry/syscall_x32.c | 15 ++++-------- arch/x86/entry/syscalls/Makefile | 10 ++++++-- arch/x86/entry/syscalls/syscalltbl.sh | 46 ----------------------------------- arch/x86/include/asm/Kbuild | 1 + arch/x86/um/sys_call_table_32.c | 8 +++--- arch/x86/um/sys_call_table_64.c | 9 +++---- 8 files changed, 34 insertions(+), 76 deletions(-) delete mode 100644 arch/x86/entry/syscalls/syscalltbl.sh (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 86eb0d89d46f..70bf46e73b1c 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -8,12 +8,18 @@ #include #include -#define __SYSCALL_I386(nr, sym) extern long __ia32_##sym(const struct pt_regs *); +#ifdef CONFIG_IA32_EMULATION +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) +#else +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#endif + +#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *); #include -#undef __SYSCALL_I386 +#undef __SYSCALL -#define __SYSCALL_I386(nr, sym) [nr] = __ia32_##sym, +#define __SYSCALL(nr, sym) [nr] = __ia32_##sym, __visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 1594ec72bcbb..82670bb10931 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -8,14 +8,11 @@ #include #include -#define __SYSCALL_X32(nr, sym) -#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym) - -#define __SYSCALL_64(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include -#undef __SYSCALL_64 +#undef __SYSCALL -#define __SYSCALL_64(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) [nr] = __x64_##sym, asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 3fea8fb9cd6a..6d2ef887d7b6 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,16 +8,11 @@ #include #include -#define __SYSCALL_64(nr, sym) +#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); +#include +#undef __SYSCALL -#define __SYSCALL_X32(nr, sym) extern long __x64_##sym(const struct pt_regs *); -#define __SYSCALL_COMMON(nr, sym) extern long __x64_##sym(const struct pt_regs *); -#include -#undef __SYSCALL_X32 -#undef __SYSCALL_COMMON - -#define __SYSCALL_X32(nr, sym) [nr] = __x64_##sym, -#define __SYSCALL_COMMON(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) [nr] = __x64_##sym, asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { /* @@ -25,5 +20,5 @@ asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { * when the & below is removed. */ [0 ... __NR_x32_syscall_max] = &__x64_sys_ni_syscall, -#include +#include }; diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index d8c4f6c9eadc..c4bd8dd82bb1 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -10,7 +10,7 @@ syscall32 := $(src)/syscall_32.tbl syscall64 := $(src)/syscall_64.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ @@ -18,7 +18,7 @@ quiet_cmd_syshdr = SYSHDR $@ '$(syshdr_pfx_$(basetarget))' \ '$(syshdr_offset_$(basetarget))' quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ quiet_cmd_hypercalls = HYPERCALLS $@ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs)) @@ -46,10 +46,15 @@ syshdr_pfx_unistd_64_x32 := x32_ $(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) +$(out)/syscalls_32.h: abis := i386 $(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE $(call if_changed,systbl) +$(out)/syscalls_64.h: abis := common,64 $(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE $(call if_changed,systbl) +$(out)/syscalls_x32.h: abis := common,x32 +$(out)/syscalls_x32.h: $(syscall64) $(systbl) FORCE + $(call if_changed,systbl) $(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE $(call if_changed,hypercalls) @@ -60,6 +65,7 @@ uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h syshdr-y += syscalls_32.h syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h syshdr-$(CONFIG_X86_64) += syscalls_64.h +syshdr-$(CONFIG_X86_X32) += syscalls_x32.h syshdr-$(CONFIG_XEN) += xen-hypercalls.h uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh deleted file mode 100644 index 929bde120d6b..000000000000 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" - -syscall_macro() { - local abi="$1" - local nr="$2" - local entry="$3" - - echo "__SYSCALL_${abi}($nr, $entry)" -} - -emit() { - local abi="$1" - local nr="$2" - local entry="$3" - local compat="$4" - - if [ "$abi" != "I386" -a -n "$compat" ]; then - echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 - exit 1 - fi - - if [ -z "$compat" ]; then - if [ -n "$entry" ]; then - syscall_macro "$abi" "$nr" "$entry" - fi - else - echo "#ifdef CONFIG_X86_32" - if [ -n "$entry" ]; then - syscall_macro "$abi" "$nr" "$entry" - fi - echo "#else" - syscall_macro "$abi" "$nr" "$compat" - echo "#endif" - fi -} - -grep '^[0-9]' "$in" | sort -n | ( - while read nr abi name entry compat; do - abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - emit "$abi" "$nr" "$entry" "$compat" - done -) > "$out" diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index b19ec8282d50..1e51650b79d7 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -3,6 +3,7 @@ generated-y += syscalls_32.h generated-y += syscalls_64.h +generated-y += syscalls_x32.h generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 2ed81e581755..e83619c365dc 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -26,11 +26,13 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) + +#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include -#undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [ nr ] = sym, +#undef __SYSCALL +#define __SYSCALL(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 2e8544dafbb0..6fb75af7cf54 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -36,14 +36,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_X32(nr, sym) -#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym) - -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include -#undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [ nr ] = sym, +#undef __SYSCALL +#define __SYSCALL(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -- cgit v1.2.3 From 44fe4895f47cbe9f4692e1d3cdc2ef8352f4d88e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:11 +0900 Subject: x86/syscalls: Stop filling syscall arrays with *_sys_ni_syscall This is a follow-up cleanup after switching to the generic syscalltbl.sh. The old x86 specific script skipped non-existing syscalls. So, the generated syscalls_64.h, for example, had a big hole in the syscall numbers 335-423 range. That is why there exists [0 ... __NR_*_syscall_max] = &__*_sys_ni_cyscall. The new script, scripts/syscalltbl.sh automatically fills holes with __SYSCALL(, sys_ni_syscall), hence such ugly code can go away. The designated initializers, '[nr] =' are also unneeded. Also, there is no need to give __NR_*_syscall_max+1 because the array size is implied by the number of syscalls in the generated headers. Hence, there is no need to include , either. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-4-masahiroy@kernel.org --- arch/x86/entry/syscall_32.c | 10 ++-------- arch/x86/entry/syscall_64.c | 10 ++-------- arch/x86/entry/syscall_x32.c | 10 ++-------- arch/x86/um/sys_call_table_32.c | 6 ------ arch/x86/um/sys_call_table_64.c | 6 ------ 5 files changed, 6 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 70bf46e73b1c..8cfc9bc73e7f 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #ifdef CONFIG_IA32_EMULATION @@ -19,13 +18,8 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = __ia32_##sym, +#define __SYSCALL(nr, sym) __ia32_##sym, -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &__ia32_sys_ni_syscall, +__visible const sys_call_ptr_t ia32_sys_call_table[] = { #include }; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 82670bb10931..be120eec1fc9 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -5,20 +5,14 @@ #include #include #include -#include #include #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) __x64_##sym, -asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall, +asmlinkage const sys_call_ptr_t sys_call_table[] = { #include }; diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 6d2ef887d7b6..bdd0e03a1265 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -5,20 +5,14 @@ #include #include #include -#include #include #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *); #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = __x64_##sym, +#define __SYSCALL(nr, sym) __x64_##sym, -asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_x32_syscall_max] = &__x64_sys_ni_syscall, +asmlinkage const sys_call_ptr_t x32_sys_call_table[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index e83619c365dc..f8323104e353 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #define __NO_STUBS @@ -37,11 +36,6 @@ extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, #include }; diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 6fb75af7cf54..5ed665dc785f 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #define __NO_STUBS @@ -45,11 +44,6 @@ extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, #include }; -- cgit v1.2.3 From f63815eb1d909a4121806e60928108ff040bf291 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:12 +0900 Subject: x86/unistd: Define X32_NR_syscalls only for 64-bit kernel X32_NR_syscalls is needed only when building a 64bit kernel. Move it to proper #ifdef guard. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-5-masahiroy@kernel.org --- arch/x86/include/asm/unistd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index c1c3d31b15c0..1bc6020bc58d 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -26,11 +26,11 @@ # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 +# define X32_NR_syscalls (__NR_x32_syscall_max + 1) # endif # define NR_syscalls (__NR_syscall_max + 1) -# define X32_NR_syscalls (__NR_x32_syscall_max + 1) # define IA32_NR_syscalls (__NR_ia32_syscall_max + 1) # define __ARCH_WANT_NEW_STAT -- cgit v1.2.3 From 49f731f1972e6e44d8a5c3982a72902b3944bc34 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:13 +0900 Subject: x86/syscalls: Use __NR_syscalls instead of __NR_syscall_max __NR_syscall_max is only used by x86 and UML. In contrast, __NR_syscalls is widely used by all the architectures. Convert __NR_syscall_max to __NR_syscalls and adjust the usage sites. This prepares x86 to switch to the generic syscallhdr.sh script. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-6-masahiroy@kernel.org --- arch/um/kernel/skas/syscall.c | 2 +- arch/x86/entry/syscalls/syscallhdr.sh | 2 +- arch/x86/include/asm/unistd.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 3d91f89fd852..9ee19e566da3 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -41,7 +41,7 @@ void handle_syscall(struct uml_pt_regs *r) goto out; syscall = UPT_SYSCALL_NR(r); - if (syscall >= 0 && syscall <= __NR_syscall_max) + if (syscall >= 0 && syscall < __NR_syscalls) PT_REGS_SET_SYSCALL_RETURN(regs, EXECUTE_SYSCALL(syscall, regs)); diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh index cc1e63857427..75e66af06773 100644 --- a/arch/x86/entry/syscalls/syscallhdr.sh +++ b/arch/x86/entry/syscalls/syscallhdr.sh @@ -28,7 +28,7 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( echo "" echo "#ifdef __KERNEL__" - echo "#define __NR_${prefix}syscall_max $max" + echo "#define __NR_${prefix}syscalls $(($max + 1))" echo "#endif" echo "" echo "#endif /* ${fileguard} */" diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 1bc6020bc58d..80e9d5206a71 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -13,7 +13,7 @@ # define __ARCH_WANT_SYS_OLD_MMAP # define __ARCH_WANT_SYS_OLD_SELECT -# define __NR_ia32_syscall_max __NR_syscall_max +# define IA32_NR_syscalls (__NR_syscalls) # else @@ -26,12 +26,12 @@ # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 -# define X32_NR_syscalls (__NR_x32_syscall_max + 1) +# define X32_NR_syscalls (__NR_x32_syscalls) +# define IA32_NR_syscalls (__NR_ia32_syscalls) # endif -# define NR_syscalls (__NR_syscall_max + 1) -# define IA32_NR_syscalls (__NR_ia32_syscall_max + 1) +# define NR_syscalls (__NR_syscalls) # define __ARCH_WANT_NEW_STAT # define __ARCH_WANT_OLD_READDIR -- cgit v1.2.3 From 3cba325b358f86357b5ce50eb9e6633183927eee Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 17 May 2021 16:38:14 +0900 Subject: x86/syscalls: Switch to generic syscallhdr.sh Many architectures duplicate similar shell scripts. Converts x86 to use scripts/syscallhdr.sh. Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210517073815.97426-7-masahiroy@kernel.org --- arch/x86/entry/syscalls/Makefile | 26 +++++++++++++------------- arch/x86/entry/syscalls/syscallhdr.sh | 35 ----------------------------------- 2 files changed, 13 insertions(+), 48 deletions(-) delete mode 100644 arch/x86/entry/syscalls/syscallhdr.sh (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index c4bd8dd82bb1..8eb014bca8c9 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -9,40 +9,40 @@ _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ syscall32 := $(src)/syscall_32.tbl syscall64 := $(src)/syscall_64.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh +syshdr := $(srctree)/scripts/syscallhdr.sh systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abi_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \ + $(if $(offset),--offset $(offset)) \ + $(if $(prefix),--prefix $(prefix)) \ + $< $@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ quiet_cmd_hypercalls = HYPERCALLS $@ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs)) -syshdr_abi_unistd_32 := i386 +$(uapi)/unistd_32.h: abis := i386 $(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_32_ia32 := i386 -syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: abis := i386 +$(out)/unistd_32_ia32.h: prefix := ia32_ $(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_x32 := common,x32 -syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT +$(uapi)/unistd_x32.h: abis := common,x32 +$(uapi)/unistd_x32.h: offset := __X32_SYSCALL_BIT $(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_64 := common,64 +$(uapi)/unistd_64.h: abis := common,64 $(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd_64_x32 := x32 -syshdr_pfx_unistd_64_x32 := x32_ +$(out)/unistd_64_x32.h: abis := x32 +$(out)/unistd_64_x32.h: prefix := x32_ $(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE $(call if_changed,syshdr) diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh deleted file mode 100644 index 75e66af06773..000000000000 --- a/arch/x86/entry/syscalls/syscallhdr.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_ASM_X86_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - echo "#ifndef ${fileguard}" - echo "#define ${fileguard} 1" - echo "" - - max=0 - while read nr abi name entry ; do - if [ -z "$offset" ]; then - echo "#define __NR_${prefix}${name} $nr" - else - echo "#define __NR_${prefix}${name} ($offset + $nr)" - fi - - max=$nr - done - - echo "" - echo "#ifdef __KERNEL__" - echo "#define __NR_${prefix}syscalls $(($max + 1))" - echo "#endif" - echo "" - echo "#endif /* ${fileguard} */" -) > "$out" -- cgit v1.2.3 From 0595494891723a1dcca5eaa8eeca8ab54ad953b9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:13:01 -0700 Subject: x86/entry/64: Sign-extend system calls on entry to int Right now, *some* code will treat e.g. 0x0000000100000001 as a system call and some will not. Some of the code, notably in ptrace, will treat 0x000000018000000 as a system call and some will not. Finally, right now, e.g. 335 for x86-64 will force the exit code to be set to -ENOSYS even if poked by ptrace, but 548 will not, because there is an observable difference between an out of range system call and a system call number that falls outside the range of the table. This is visible to the user: for example, the syscall_numbering_64 test fails if run under strace, because as strace uses ptrace, it ends up clobbering the upper half of the 64-bit system call number. The architecture independent code all assumes that a system call is "int" that the value -1 specifically and not just any negative value is used for a non-system call. This is the case on x86 as well when arch-independent code is involved. The arch-independent API is defined/documented (but not *implemented*!) in . This is an ABI change, but is in fact a revert to the original x86-64 ABI. The original assembly entry code would zero-extend the system call number; Use sign extend to be explicit that this is treated as a signed number (although in practice it makes no difference, of course) and to avoid people getting the idea of "optimizing" it, as has happened on at least two(!) separate occasions. Do not store the extended value into regs->orig_ax, however: on x86-64, the ABI is that the callee is responsible for extending parameters, so only examining the lower 32 bits is fully consistent with any "int" argument to any system call, e.g. regs->di for write(2). The full value of %rax on entry to the kernel is thus still available. [ tglx: Add a comment to the ASM code ] Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-5-hpa@zytor.com --- arch/x86/entry/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1d9db15fdc69..a5f02d03c585 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -108,7 +108,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) /* IRQs are off. */ movq %rsp, %rdi - movq %rax, %rsi + /* Sign extend the lower 32bit as syscall numbers are treated as int */ + movslq %eax, %rsi call do_syscall_64 /* returns with IRQs disabled */ /* -- cgit v1.2.3 From b337b4965e3a3e567f11828a9e3fe3fb3faefa47 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:13:02 -0700 Subject: x86/entry: Treat out of range and gap system calls the same The current 64-bit system call entry code treats out-of-range system calls differently than system calls that map to a hole in the system call table. This is visible to the user if system calls are intercepted via ptrace or seccomp and the return value (regs->ax) is modified: in the former case, the return value is preserved, and in the latter case, sys_ni_syscall() is called and the return value is forced to -ENOSYS. The API spec in is very clear that only (int)-1 is the non-system-call sentinel value, so make the system call behavior consistent by calling sys_ni_syscall() for all invalid system call numbers except for -1. Although currently sys_ni_syscall() simply returns -ENOSYS, calling it explicitly is friendly for tracing and future possible extensions, and as this is an error path there is no reason to optimize it. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-6-hpa@zytor.com --- arch/x86/entry/common.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 00da0f5420de..f51bc17262db 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -52,6 +52,8 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, unsigned long nr) X32_NR_syscalls); regs->ax = x32_sys_call_table[nr](regs); #endif + } else if (unlikely((int)nr != -1)) { + regs->ax = __x64_sys_ni_syscall(regs); } instrumentation_end(); syscall_exit_to_user_mode(regs); @@ -76,6 +78,8 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, if (likely(nr < IA32_NR_syscalls)) { nr = array_index_nospec(nr, IA32_NR_syscalls); regs->ax = ia32_sys_call_table[nr](regs); + } else if (unlikely((int)nr != -1)) { + regs->ax = __ia32_sys_ni_syscall(regs); } } -- cgit v1.2.3 From ae897fda4f507e4b239f0bdfd578b3688ca96fb4 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 20 May 2021 13:42:42 +0200 Subject: x86/Xen: swap NX determination and GDT setup on BSP xen_setup_gdt(), via xen_load_gdt_boot(), wants to adjust page tables. For this to work when NX is not available, x86_configure_nx() needs to be called first. [jgross] Note that this is a revert of 36104cb9012a82e73 ("x86/xen: Delay get_cpu_cap until stack canary is established"), which is possible now that we no longer support running as PV guest in 32-bit mode. Cc: # 5.9 Fixes: 36104cb9012a82e73 ("x86/xen: Delay get_cpu_cap until stack canary is established") Reported-by: Olaf Hering Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/12a866b0-9e89-59f7-ebeb-a2a6cec0987a@suse.com Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_pv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 17503fed2017..e87699aa2dc8 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1273,16 +1273,16 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Get mfn list */ xen_build_dynamic_phys_to_machine(); + /* Work out if we support NX */ + get_cpu_cap(&boot_cpu_data); + x86_configure_nx(); + /* * Set up kernel GDT and segment registers, mainly so that * -fstack-protector code can be executed. */ xen_setup_gdt(0); - /* Work out if we support NX */ - get_cpu_cap(&boot_cpu_data); - x86_configure_nx(); - /* Determine virtual and physical address sizes */ get_cpu_address_sizes(&boot_cpu_data); -- cgit v1.2.3 From d06aca989c243dd9e5d3e20aa4e5c2ecfdd07050 Mon Sep 17 00:00:00 2001 From: Joe Richey Date: Fri, 21 May 2021 01:58:42 -0700 Subject: x86/elf: Use _BITUL() macro in UAPI headers Replace BIT() in x86's UAPI header with _BITUL(). BIT() is not defined in the UAPI headers and its usage may cause userspace build errors. Fixes: 742c45c3ecc9 ("x86/elf: Enumerate kernel FSGSBASE capability in AT_HWCAP2") Signed-off-by: Joe Richey Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210521085849.37676-2-joerichey94@gmail.com --- arch/x86/include/uapi/asm/hwcap2.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 5fdfcb47000f..054604aba9f0 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -2,10 +2,12 @@ #ifndef _ASM_X86_HWCAP2_H #define _ASM_X86_HWCAP2_H +#include + /* MONITOR/MWAIT enabled in Ring 3 */ -#define HWCAP2_RING3MWAIT (1 << 0) +#define HWCAP2_RING3MWAIT _BITUL(0) /* Kernel allows FSGSBASE instructions available in Ring 3 */ -#define HWCAP2_FSGSBASE BIT(1) +#define HWCAP2_FSGSBASE _BITUL(1) #endif -- cgit v1.2.3 From 2ade8fc65076095460e3ea1ca65a8f619d7d9a3a Mon Sep 17 00:00:00 2001 From: David Bartley Date: Thu, 20 May 2021 10:41:30 -0700 Subject: x86/amd_nb: Add AMD family 19h model 50h PCI ids This is required to support Zen3 APUs in k10temp. Signed-off-by: David Bartley Signed-off-by: Borislav Petkov Acked-by: Wei Huang Link: https://lkml.kernel.org/r/20210520174130.94954-1-andareed@gmail.com --- arch/x86/kernel/amd_nb.c | 3 +++ include/linux/pci_ids.h | 1 + 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 09083094eb57..23dda362dc0f 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -25,6 +25,7 @@ #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e /* Protect the PCI config register pairs used for SMN and DF indirect access. */ static DEFINE_MUTEX(smn_mutex); @@ -57,6 +58,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, {} }; @@ -72,6 +74,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 4c3fa5293d76..5356ccf1c275 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -555,6 +555,7 @@ #define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443 #define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653 +#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 -- cgit v1.2.3 From f1b7d45d3f8f3e18e190e71cb54d4b1917300d1d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Wed, 19 May 2021 14:21:49 -0700 Subject: x86/irq: Remove unused vectors defines UV_BAU_MESSAGE is defined but not used anywhere in the kernel. Presumably this is a stale vector number that can be reclaimed. MCE_VECTOR is not an actual vector: #MC is an exception, not an interrupt vector, and as such is correctly described as X86_TRAP_MC. MCE_VECTOR is not used anywhere is the kernel. Note that NMI_VECTOR *is* used; specifically it is the vector number programmed into the APIC LVT when an NMI interrupt is configured. At the moment it is always numerically identical to X86_TRAP_NMI, that is not necessarily going to be the case indefinitely. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Acked-by: Steve Wahl Link: https://lore.kernel.org/r/20210519212154.511983-4-hpa@zytor.com --- arch/x86/include/asm/irq_vectors.h | 4 ++-- tools/arch/x86/include/asm/irq_vectors.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 889f8b1b5b7f..dc71b781be42 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -26,8 +26,8 @@ * This file enumerates the exact layout of them: */ +/* This is used as an interrupt vector when programming the APIC. */ #define NMI_VECTOR 0x02 -#define MCE_VECTOR 0x12 /* * IDT vectors usable for external interrupt sources start at 0x20. @@ -84,7 +84,7 @@ */ #define IRQ_WORK_VECTOR 0xf6 -#define UV_BAU_MESSAGE 0xf5 +/* 0xf5 - unused, was UV_BAU_MESSAGE */ #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h index 889f8b1b5b7f..dc71b781be42 100644 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ b/tools/arch/x86/include/asm/irq_vectors.h @@ -26,8 +26,8 @@ * This file enumerates the exact layout of them: */ +/* This is used as an interrupt vector when programming the APIC. */ #define NMI_VECTOR 0x02 -#define MCE_VECTOR 0x12 /* * IDT vectors usable for external interrupt sources start at 0x20. @@ -84,7 +84,7 @@ */ #define IRQ_WORK_VECTOR 0xf6 -#define UV_BAU_MESSAGE 0xf5 +/* 0xf5 - unused, was UV_BAU_MESSAGE */ #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ -- cgit v1.2.3 From ff851003880de9d1111498877551ba16668c38ef Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Wed, 19 May 2021 14:21:48 -0700 Subject: x86/irq: Add and use NR_EXTERNAL_VECTORS and NR_SYSTEM_VECTORS Add defines for the number of external vectors and number of system vectors instead of requiring the use of (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) and (NR_VECTORS - FIRST_SYSTEM_VECTOR) respectively. Clean up the usage sites. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20210519212154.511983-3-hpa@zytor.com --- arch/x86/include/asm/idtentry.h | 4 ++-- arch/x86/include/asm/irq_vectors.h | 3 +++ tools/arch/x86/include/asm/irq_vectors.h | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 73d45b0dfff2..c03a18cac78e 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -504,7 +504,7 @@ __visible noinstr void func(struct pt_regs *regs, \ .align 8 SYM_CODE_START(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR - .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + .rept NR_EXTERNAL_VECTORS UNWIND_HINT_IRET_REGS 0 : .byte 0x6a, vector @@ -520,7 +520,7 @@ SYM_CODE_END(irq_entries_start) .align 8 SYM_CODE_START(spurious_entries_start) vector=FIRST_SYSTEM_VECTOR - .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + .rept NR_SYSTEM_VECTORS UNWIND_HINT_IRET_REGS 0 : .byte 0x6a, vector diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index dc71b781be42..43dcb9284208 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -114,6 +114,9 @@ #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif +#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) +#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR) + /* * Size the maximum number of interrupts. * diff --git a/tools/arch/x86/include/asm/irq_vectors.h b/tools/arch/x86/include/asm/irq_vectors.h index dc71b781be42..43dcb9284208 100644 --- a/tools/arch/x86/include/asm/irq_vectors.h +++ b/tools/arch/x86/include/asm/irq_vectors.h @@ -114,6 +114,9 @@ #define FIRST_SYSTEM_VECTOR NR_VECTORS #endif +#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) +#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR) + /* * Size the maximum number of interrupts. * -- cgit v1.2.3 From 8ec9069a432c873e52e6f4ce1496f282a4299604 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Wed, 19 May 2021 14:21:50 -0700 Subject: x86/idt: Remove address argument from idt_invalidate() There is no reason to specify any specific address to idt_invalidate(). It looks mostly like an artifact of unifying code done differently by accident. The most "sensible" address to set here is a NULL pointer - virtual address zero, just as a visual marker. This also makes it possible to mark the struct desc_ptr in idt_invalidate() as static const. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210519212154.511983-5-hpa@zytor.com --- arch/x86/include/asm/desc.h | 2 +- arch/x86/kernel/idt.c | 5 ++--- arch/x86/kernel/machine_kexec_32.c | 2 +- arch/x86/kernel/reboot.c | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 476082a83d1c..b8429ae50b71 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -427,6 +427,6 @@ static inline void idt_setup_early_pf(void) { } static inline void idt_setup_ist_traps(void) { } #endif -extern void idt_invalidate(void *addr); +extern void idt_invalidate(void); #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d552f177eca0..2779f5226dc2 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -331,11 +331,10 @@ void __init idt_setup_early_handler(void) /** * idt_invalidate - Invalidate interrupt descriptor table - * @addr: The virtual address of the 'invalid' IDT */ -void idt_invalidate(void *addr) +void idt_invalidate(void) { - struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 }; + static const struct desc_ptr idt = { .address = 0, .size = 0 }; load_idt(&idt); } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 64b00b0d7fe8..1e34feebcd5d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -232,7 +232,7 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - idt_invalidate(phys_to_virt(0)); + idt_invalidate(); set_gdt(phys_to_virt(0), 0); /* now call it */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index b29657b76e3f..ebfb91108232 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -669,7 +669,7 @@ static void native_machine_emergency_restart(void) break; case BOOT_TRIPLE: - idt_invalidate(NULL); + idt_invalidate(); __asm__ __volatile__("int3"); /* We're probably dead after this, but... */ -- cgit v1.2.3 From 283fa3b6483a84aeb62f1b97c2ec7c02eb2f5882 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Wed, 19 May 2021 14:21:51 -0700 Subject: x86: Add native_[ig]dt_invalidate() In some places, the native forms of descriptor table invalidation is required. Rather than open-coding them, add explicitly native functions to invalidate the GDT and IDT. Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210519212154.511983-6-hpa@zytor.com --- arch/x86/include/asm/desc.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index b8429ae50b71..400c17862870 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -224,6 +224,26 @@ static inline void store_idt(struct desc_ptr *dtr) asm volatile("sidt %0":"=m" (*dtr)); } +static inline void native_gdt_invalidate(void) +{ + const struct desc_ptr invalid_gdt = { + .address = 0, + .size = 0 + }; + + native_load_gdt(&invalid_gdt); +} + +static inline void native_idt_invalidate(void) +{ + const struct desc_ptr invalid_idt = { + .address = 0, + .size = 0 + }; + + native_load_idt(&invalid_idt); +} + /* * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is * a read-only remapping. To prevent a page fault, the GDT is switched to the -- cgit v1.2.3 From 056c52f5e824c050c58fd27ea6d717cba32239c2 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Wed, 19 May 2021 14:21:52 -0700 Subject: x86/kexec: Set_[gi]dt() -> native_[gi]dt_invalidate() in machine_kexec_*.c These files contain private set_gdt() functions which are only used to invalid the gdt; machine_kexec_64.c also contains a set_idt() function to invalidate the idt. phys_to_virt(0) *really* doesn't make any sense for creating an invalid GDT. A NULL pointer (virtual 0) makes a lot more sense; although neither will allow any actual memory reference, a NULL pointer stands out more. Replace these calls with native_[gi]dt_invalidate(). Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210519212154.511983-7-hpa@zytor.com --- arch/x86/kernel/machine_kexec_32.c | 15 ++------------- arch/x86/kernel/machine_kexec_64.c | 33 ++------------------------------- 2 files changed, 4 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 1e34feebcd5d..1b373d79cedc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,17 +23,6 @@ #include #include -static void set_gdt(void *newgdt, __u16 limit) -{ - struct desc_ptr curgdt; - - /* ia32 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - load_gdt(&curgdt); -} - static void load_segments(void) { #define __STR(X) #X @@ -232,8 +221,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - idt_invalidate(); - set_gdt(phys_to_virt(0), 0); + native_idt_invalidate(); + native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index c078b0d3ab0e..131f30fdcfbd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -256,35 +256,6 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) return init_transition_pgtable(image, level4p); } -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaligned loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - static void load_segments(void) { __asm__ __volatile__ ( @@ -379,8 +350,8 @@ void machine_kexec(struct kimage *image) * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); + native_idt_invalidate(); + native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel((unsigned long)image->head, -- cgit v1.2.3 From bb11580f61b6c4ba5c35706abd927c8ac8c32852 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Sat, 15 May 2021 10:14:04 +0200 Subject: x86/efi: Log 32/64-bit mismatch with kernel as an error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Log the message No EFI runtime due to 32/64-bit mismatch with kernel as an error condition, as several things like efivarfs won’t work without the EFI runtime. Signed-off-by: Paul Menzel Signed-off-by: Ard Biesheuvel --- arch/x86/platform/efi/efi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 8a26e705cb06..147c30a81f15 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -468,7 +468,7 @@ void __init efi_init(void) */ if (!efi_runtime_supported()) - pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); + pr_err("No EFI runtime due to 32/64-bit mismatch with kernel\n"); if (!efi_runtime_supported() || efi_runtime_disabled()) { efi_memmap_unmap(); -- cgit v1.2.3 From 778a136e48be6b1b703328a0a4d6d459cf97449f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 18 May 2021 16:43:35 +0200 Subject: KVM: SVM: Drop unneeded CONFIG_X86_LOCAL_APIC check AVIC dependency on CONFIG_X86_LOCAL_APIC is dead code since commit e42eef4ba388 ("KVM: add X86_LOCAL_APIC dependency"). Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20210518144339.1987982-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Reviewed-by: Sean Christopherson --- arch/x86/kvm/svm/avic.c | 2 -- arch/x86/kvm/svm/svm.c | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 712b4e0de481..1c1bf911e02b 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -29,9 +29,7 @@ /* enable / disable AVIC */ int avic; -#ifdef CONFIG_X86_LOCAL_APIC module_param(avic, int, S_IRUGO); -#endif #define SVM_AVIC_DOORBELL 0xc001011b diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dfa351e605de..8c3918a11826 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1010,9 +1010,7 @@ static __init int svm_hardware_setup(void) } if (avic) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) { avic = false; } else { pr_info("AVIC enabled\n"); -- cgit v1.2.3 From 377872b3355b9a7f04f25388e2c9399845259c05 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 18 May 2021 16:43:36 +0200 Subject: KVM: VMX: Drop unneeded CONFIG_X86_LOCAL_APIC check CONFIG_X86_LOCAL_APIC is always on when CONFIG_KVM (on x86) since commit e42eef4ba388 ("KVM: add X86_LOCAL_APIC dependency"). Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20210518144339.1987982-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Reviewed-by: Sean Christopherson --- arch/x86/kvm/vmx/capabilities.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 8dee8a5fbc17..aa0e7872fcc9 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -90,8 +90,7 @@ static inline bool cpu_has_vmx_preemption_timer(void) static inline bool cpu_has_vmx_posted_intr(void) { - return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && - vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_load_ia32_efer(void) -- cgit v1.2.3 From 28a4aa1160d71187a44414dac40b57d1fd9fcd77 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 May 2021 18:22:28 +0200 Subject: KVM: SVM: make the avic parameter a bool Make it consistent with kvm_intel.enable_apicv. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 4 ++-- arch/x86/kvm/svm/svm.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1c1bf911e02b..0e62e6a2438c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -28,8 +28,8 @@ #include "svm.h" /* enable / disable AVIC */ -int avic; -module_param(avic, int, S_IRUGO); +bool avic; +module_param(avic, bool, S_IRUGO); #define SVM_AVIC_DOORBELL 0xc001011b diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index e44567ceb865..70419e417c0d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -479,7 +479,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -extern int avic; +extern bool avic; static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { -- cgit v1.2.3 From 2978996f620001f4e748c79af0fe89be729ef58d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Tue, 18 May 2021 12:13:03 -0700 Subject: x86/entry: Use int everywhere for system call numbers System call numbers are defined as int, so use int everywhere for system call numbers. This is strictly a cleanup; it should not change anything user visible; all ABI changes have been done in the preceeding patches. [ tglx: Replaced the unsigned long cast ] Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210518191303.4135296-7-hpa@zytor.com --- arch/x86/entry/common.c | 87 ++++++++++++++++++++++++++++-------------- arch/x86/include/asm/syscall.h | 2 +- 2 files changed, 60 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index f51bc17262db..ee95fe3f1518 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -36,49 +36,81 @@ #include #ifdef CONFIG_X86_64 -__visible noinstr void do_syscall_64(struct pt_regs *regs, unsigned long nr) + +static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr) +{ + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < NR_syscalls)) { + unr = array_index_nospec(unr, NR_syscalls); + regs->ax = sys_call_table[unr](regs); + return true; + } + return false; +} + +static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr) +{ + /* + * Adjust the starting offset of the table, and convert numbers + * < __X32_SYSCALL_BIT to very high and thus out of range + * numbers for comparisons. + */ + unsigned int xnr = nr - __X32_SYSCALL_BIT; + + if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) { + xnr = array_index_nospec(xnr, X32_NR_syscalls); + regs->ax = x32_sys_call_table[xnr](regs); + return true; + } + return false; +} + +__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr) { add_random_kstack_offset(); nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); - if (likely(nr < NR_syscalls)) { - nr = array_index_nospec(nr, NR_syscalls); - regs->ax = sys_call_table[nr](regs); -#ifdef CONFIG_X86_X32_ABI - } else if (likely((nr & __X32_SYSCALL_BIT) && - (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { - nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, - X32_NR_syscalls); - regs->ax = x32_sys_call_table[nr](regs); -#endif - } else if (unlikely((int)nr != -1)) { + + if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { + /* Invalid system call, but still a system call. */ regs->ax = __x64_sys_ni_syscall(regs); } + instrumentation_end(); syscall_exit_to_user_mode(regs); } #endif #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) +static __always_inline int syscall_32_enter(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_IA32_EMULATION)) current_thread_info()->status |= TS_COMPAT; - return (unsigned int)regs->orig_ax; + return (int)regs->orig_ax; } /* * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, - unsigned int nr) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) { - if (likely(nr < IA32_NR_syscalls)) { - nr = array_index_nospec(nr, IA32_NR_syscalls); - regs->ax = ia32_sys_call_table[nr](regs); - } else if (unlikely((int)nr != -1)) { + /* + * Convert negative numbers to very high and thus out of range + * numbers for comparisons. + */ + unsigned int unr = nr; + + if (likely(unr < IA32_NR_syscalls)) { + unr = array_index_nospec(unr, IA32_NR_syscalls); + regs->ax = ia32_sys_call_table[unr](regs); + } else if (nr != -1) { regs->ax = __ia32_sys_ni_syscall(regs); } } @@ -86,15 +118,15 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + int nr = syscall_32_enter(regs); add_random_kstack_offset(); /* - * Subtlety here: if ptrace pokes something larger than 2^32-1 into - * orig_ax, the unsigned int return value truncates it. This may - * or may not be necessary, but it matches the old asm behavior. + * Subtlety here: if ptrace pokes something larger than 2^31-1 into + * orig_ax, the int return value truncates it. This matches + * the semantics of syscall_get_nr(). */ - nr = (unsigned int)syscall_enter_from_user_mode(regs, nr); + nr = syscall_enter_from_user_mode(regs, nr); instrumentation_begin(); do_syscall_32_irqs_on(regs, nr); @@ -105,7 +137,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { - unsigned int nr = syscall_32_enter(regs); + int nr = syscall_32_enter(regs); int res; add_random_kstack_offset(); @@ -140,8 +172,7 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) return false; } - /* The case truncates any ptrace induced syscall nr > 2^32 -1 */ - nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr); + nr = syscall_enter_from_user_mode_work(regs, nr); /* Now this is just like a normal syscall. */ do_syscall_32_irqs_on(regs, nr); diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f6593cafdbd9..f7e2d82d24fb 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -159,7 +159,7 @@ static inline int syscall_get_arch(struct task_struct *task) ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } -void do_syscall_64(struct pt_regs *regs, unsigned long nr); +void do_syscall_64(struct pt_regs *regs, int nr); void do_int80_syscall_32(struct pt_regs *regs); long do_fast_syscall_32(struct pt_regs *regs); -- cgit v1.2.3 From 1eb8a49836949a77c4f7d738786719e7fde0c333 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 25 May 2021 20:54:20 +0900 Subject: x86/syscalls: Clear 'offset' and 'prefix' in case they are set in env If the environment variable 'prefix' is set on the build host, it is wrongly used as syscall macro prefixes. $ export prefix=/usr $ make -s defconfig all In file included from ./arch/x86/include/asm/unistd.h:20, from :2: ./arch/x86/include/generated/uapi/asm/unistd_64.h:4:9: warning: missing whitespace after the macro name 4 | #define __NR_/usrread 0 | ^~~~~ arch/x86/entry/syscalls/Makefile should clear 'offset' and 'prefix'. Fixes: 3cba325b358f ("x86/syscalls: Switch to generic syscallhdr.sh") Reported-by: Naresh Kamboju Signed-off-by: Masahiro Yamada Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210525115420.679416-1-masahiroy@kernel.org --- arch/x86/entry/syscalls/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile index 8eb014bca8c9..5b3efed0e4e8 100644 --- a/arch/x86/entry/syscalls/Makefile +++ b/arch/x86/entry/syscalls/Makefile @@ -11,6 +11,8 @@ syscall64 := $(src)/syscall_64.tbl syshdr := $(srctree)/scripts/syscallhdr.sh systbl := $(srctree)/scripts/syscalltbl.sh +offset := +prefix := quiet_cmd_syshdr = SYSHDR $@ cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \ -- cgit v1.2.3 From d48ca5b98fa5d21444e04bb17373d339200b679a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 24 May 2021 14:17:05 -0400 Subject: x86/uml/syscalls: Remove array index from syscall initializers The recent syscall table generator rework removed the index from the initializers for native x86 syscall tables, but missed the UML syscall tables. Fixes: 44fe4895f47c ("Stop filling syscall arrays with *_sys_ni_syscall") Signed-off-by: Brian Gerst Signed-off-by: Thomas Gleixner Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20210524181707.132844-2-brgerst@gmail.com --- arch/x86/um/sys_call_table_32.c | 2 +- arch/x86/um/sys_call_table_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index f8323104e353..0575decb5e54 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -31,7 +31,7 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, +#define __SYSCALL(nr, sym) sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 5ed665dc785f..95725b5a41ac 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -39,7 +39,7 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, +#define __SYSCALL(nr, sym) sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -- cgit v1.2.3 From fd9e8691f38712892fa2ac73132dcc8b85b07a8f Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 24 May 2021 14:17:06 -0400 Subject: x86/syscalls: Remove -Wno-override-init for syscall tables Commit 44fe4895f47c ("Stop filling syscall arrays with *_sys_ni_syscall") removes the need for -Wno-override-init, since the table is now filled sequentially instead of overriding a default value. Signed-off-by: Brian Gerst Signed-off-by: Thomas Gleixner Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20210524181707.132844-3-brgerst@gmail.com --- arch/x86/entry/Makefile | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 08bf95dbc911..94d2843ce80c 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -17,10 +17,6 @@ CFLAGS_syscall_64.o += -fno-stack-protector CFLAGS_syscall_32.o += -fno-stack-protector CFLAGS_syscall_x32.o += -fno-stack-protector -CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) -CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,) -CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,) - obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o -- cgit v1.2.3 From 48f7eee81cd53a94699d28959566b41a9dcac1d9 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 24 May 2021 14:17:07 -0400 Subject: x86/syscalls: Don't adjust CFLAGS for syscall tables The syscall_*.c files only contain data (the syscall tables). There is no need to adjust CFLAGS for tracing and stack protector since they contain no code. Signed-off-by: Brian Gerst Signed-off-by: Thomas Gleixner Reviewed-by: Masahiro Yamada Link: https://lore.kernel.org/r/20210524181707.132844-4-brgerst@gmail.com --- arch/x86/entry/Makefile | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 94d2843ce80c..7fec5dcf6438 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -8,14 +8,8 @@ UBSAN_SANITIZE := n KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE) CFLAGS_common.o += -fno-stack-protector -CFLAGS_syscall_64.o += -fno-stack-protector -CFLAGS_syscall_32.o += -fno-stack-protector -CFLAGS_syscall_x32.o += -fno-stack-protector obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o -- cgit v1.2.3 From 9be85de97786a75f62080de1c0c13656f65cba84 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 25 May 2021 15:02:00 +0100 Subject: locking/atomic: make ARCH_ATOMIC a Kconfig symbol Subsequent patches will move architectures over to the ARCH_ATOMIC API, after preparing the asm-generic atomic implementations to function with or without ARCH_ATOMIC. As some architectures use the asm-generic implementations exclusively (and don't have a local atomic.h), and to avoid the risk that ARCH_ATOMIC isn't defined in some cases we expect, let's make the ARCH_ATOMIC macro a Kconfig symbol instead, so that we can guarantee it is consistently available where needed. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Boqun Feng Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210525140232.53872-2-mark.rutland@arm.com --- arch/Kconfig | 3 +++ arch/arm64/Kconfig | 1 + arch/arm64/include/asm/atomic.h | 2 -- arch/s390/Kconfig | 1 + arch/s390/include/asm/atomic.h | 2 -- arch/um/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/include/asm/atomic.h | 2 -- include/linux/atomic.h | 2 +- 9 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index c45b770d3579..3fb3b12d4a95 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -11,6 +11,9 @@ source "arch/$(SRCARCH)/Kconfig" menu "General architecture-dependent options" +config ARCH_ATOMIC + bool + config CRASH_CORE bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f1d8566bbf9..62ab429d1f42 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -9,6 +9,7 @@ config ARM64 select ACPI_MCFG if (ACPI && PCI) select ACPI_SPCR_TABLE if ACPI select ACPI_PPTT if ACPI + select ARCH_ATOMIC select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index b56a4b2bc248..c9979273d389 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -223,6 +223,4 @@ static __always_inline long arch_atomic64_dec_if_positive(atomic64_t *v) #define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive -#define ARCH_ATOMIC - #endif /* __ASM_ATOMIC_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b4c7c34069f8..85374a36c69e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -58,6 +58,7 @@ config S390 # Note: keep this list sorted alphabetically # imply IMA_SECURE_AND_OR_TRUSTED_BOOT + select ARCH_ATOMIC select ARCH_32BIT_USTAT_F_TINODE select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index 7c93c6573524..7138d189cc42 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -147,6 +147,4 @@ ATOMIC64_OPS(xor) #define arch_atomic64_fetch_sub(_i, _v) arch_atomic64_fetch_add(-(s64)(_i), _v) #define arch_atomic64_sub(_i, _v) arch_atomic64_add(-(s64)(_i), _v) -#define ARCH_ATOMIC - #endif /* __ARCH_S390_ATOMIC__ */ diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 57cfd9a1c082..4370a9521ea4 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -5,6 +5,7 @@ menu "UML-specific options" config UML bool default y + select ARCH_ATOMIC select ARCH_EPHEMERAL_INODES select ARCH_HAS_KCOV select ARCH_NO_PREEMPT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0045e1b44190..11a27563033d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -58,6 +58,7 @@ config X86 # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI + select ARCH_ATOMIC select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index f732741ad7c7..5e754e895767 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -269,6 +269,4 @@ static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) # include #endif -#define ARCH_ATOMIC - #endif /* _ASM_X86_ATOMIC_H */ diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 571a11008ab5..4f8d83f9e480 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -77,7 +77,7 @@ __ret; \ }) -#ifdef ARCH_ATOMIC +#ifdef CONFIG_ARCH_ATOMIC #include #include #else -- cgit v1.2.3 From 3c1885187bc1faa0a1c52f7bd34550740a208169 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 25 May 2021 15:02:31 +0100 Subject: locking/atomic: delete !ARCH_ATOMIC remnants Now that all architectures implement ARCH_ATOMIC, we can make it mandatory, removing the Kconfig symbol and logic for !ARCH_ATOMIC. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Acked-by: Geert Uytterhoeven Cc: Boqun Feng Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210525140232.53872-33-mark.rutland@arm.com --- arch/Kconfig | 3 - arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/csky/Kconfig | 1 - arch/h8300/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nds32/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/um/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - include/asm-generic/atomic.h | 44 +- include/asm-generic/atomic64.h | 29 - include/asm-generic/cmpxchg.h | 21 - include/linux/atomic-fallback.h | 2595 --------------------------------------- include/linux/atomic.h | 4 - scripts/atomic/check-atomics.sh | 1 - scripts/atomic/gen-atomics.sh | 1 - 31 files changed, 3 insertions(+), 2718 deletions(-) delete mode 100644 include/linux/atomic-fallback.h (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 3fb3b12d4a95..c45b770d3579 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -11,9 +11,6 @@ source "arch/$(SRCARCH)/Kconfig" menu "General architecture-dependent options" -config ARCH_ATOMIC - bool - config CRASH_CORE bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 7920fc2e2a2a..5998106faa60 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -2,7 +2,6 @@ config ALPHA bool default y - select ARCH_ATOMIC select ARCH_32BIT_USTAT_F_TINODE select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 098ecc72d048..2d98501c0897 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,7 +6,6 @@ config ARC def_bool y select ARC_TIMERS - select ARCH_ATOMIC select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b7334a6643b9..24804f11302d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -3,7 +3,6 @@ config ARM bool default y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 62ab429d1f42..9f1d8566bbf9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -9,7 +9,6 @@ config ARM64 select ACPI_MCFG if (ACPI && PCI) select ACPI_SPCR_TABLE if ACPI select ACPI_PPTT if ACPI - select ARCH_ATOMIC select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 3521f14bcd96..8de5b987edb9 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -2,7 +2,6 @@ config CSKY def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index bdf05ad3206a..3e3e0f16f7e0 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -2,7 +2,6 @@ config H8300 def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_BINFMT_FLAT select BINFMT_FLAT_ARGVP_ENVP_ON_STACK select BINFMT_FLAT_OLD_ALWAYS_RAM diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 1368954ef679..44a409967af1 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -5,7 +5,6 @@ comment "Linux Kernel Configuration for Hexagon" config HEXAGON def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_NO_PREEMPT # Other pending projects/to-do items. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index c5414dcd5d0d..279252e3e0f7 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -8,7 +8,6 @@ menu "Processor type and features" config IA64 bool - select ARCH_ATOMIC select ARCH_HAS_DMA_MARK_CLEAN select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index d1d91ac47f51..372e4e69c43a 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -3,7 +3,6 @@ config M68K bool default y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 5a52922dc225..0660f47012bc 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -2,7 +2,6 @@ config MICROBLAZE def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_NO_SWAP select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 55b4da96872f..ed51970c08e7 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -3,7 +3,6 @@ config MIPS bool default y select ARCH_32BIT_OFF_T if !64BIT - select ARCH_ATOMIC select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_HAS_DEBUG_VIRTUAL if !64BIT select ARCH_HAS_FORTIFY_SOURCE diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 352913573aee..62313902d75d 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -7,7 +7,6 @@ config NDS32 def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 67dae88c5b53..c24955c81c92 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -2,7 +2,6 @@ config NIOS2 def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 8c50bc9674f5..591acc5990dc 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -7,7 +7,6 @@ config OPENRISC def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_DMA_SET_UNCACHED select ARCH_HAS_DMA_CLEAR_UNCACHED select ARCH_HAS_SYNC_DMA_FOR_DEVICE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index bfa120a4add1..bde9907bc5b2 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -2,7 +2,6 @@ config PARISC def_bool y select ARCH_32BIT_OFF_T if !64BIT - select ARCH_ATOMIC select ARCH_MIGHT_HAVE_PC_PARPORT select HAVE_IDE select HAVE_FUNCTION_TRACER diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d143c2b616f0..088dd2afcfe4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -118,7 +118,6 @@ config PPC # Please keep this list sorted alphabetically. # select ARCH_32BIT_OFF_T if PPC32 - select ARCH_ATOMIC select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_COPY_MC if PPC64 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index c59b9f4a9d62..a8ad8eb76120 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -12,7 +12,6 @@ config 32BIT config RISCV def_bool y - select ARCH_ATOMIC select ARCH_CLOCKSOURCE_INIT select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 85374a36c69e..b4c7c34069f8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -58,7 +58,6 @@ config S390 # Note: keep this list sorted alphabetically # imply IMA_SECURE_AND_OR_TRUSTED_BOOT - select ARCH_ATOMIC select ARCH_32BIT_USTAT_F_TINODE select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index d2925cbb6fa4..68129537e350 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -2,7 +2,6 @@ config SUPERH def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU select ARCH_HAVE_CUSTOM_GPIO_H diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 46790083e918..164a5254c91c 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -13,7 +13,6 @@ config 64BIT config SPARC bool default y - select ARCH_ATOMIC select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI select ARCH_MIGHT_HAVE_PC_SERIO select DMA_OPS diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 4370a9521ea4..57cfd9a1c082 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -5,7 +5,6 @@ menu "UML-specific options" config UML bool default y - select ARCH_ATOMIC select ARCH_EPHEMERAL_INODES select ARCH_HAS_KCOV select ARCH_NO_PREEMPT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 11a27563033d..0045e1b44190 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -58,7 +58,6 @@ config X86 # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ARCH_ATOMIC select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 39bb9bdae6b1..2332b2156993 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -2,7 +2,6 @@ config XTENSA def_bool y select ARCH_32BIT_OFF_T - select ARCH_ATOMIC select ARCH_HAS_BINFMT_FLAT if !MMU select ARCH_HAS_DMA_PREP_COHERENT if MMU select ARCH_HAS_SYNC_DMA_FOR_CPU if MMU diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 649060fa0fe8..04b8be9f1a77 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -12,14 +12,6 @@ #include #include -#ifdef CONFIG_ARCH_ATOMIC -#define __ga_cmpxchg arch_cmpxchg -#define __ga_xchg arch_xchg -#else -#define __ga_cmpxchg cmpxchg -#define __ga_xchg xchg -#endif - #ifdef CONFIG_SMP /* we can build all atomic primitives from cmpxchg */ @@ -30,7 +22,7 @@ static inline void generic_atomic_##op(int i, atomic_t *v) \ int c, old; \ \ c = v->counter; \ - while ((old = __ga_cmpxchg(&v->counter, c, c c_op i)) != c) \ + while ((old = arch_cmpxchg(&v->counter, c, c c_op i)) != c) \ c = old; \ } @@ -40,7 +32,7 @@ static inline int generic_atomic_##op##_return(int i, atomic_t *v) \ int c, old; \ \ c = v->counter; \ - while ((old = __ga_cmpxchg(&v->counter, c, c c_op i)) != c) \ + while ((old = arch_cmpxchg(&v->counter, c, c c_op i)) != c) \ c = old; \ \ return c c_op i; \ @@ -52,7 +44,7 @@ static inline int generic_atomic_fetch_##op(int i, atomic_t *v) \ int c, old; \ \ c = v->counter; \ - while ((old = __ga_cmpxchg(&v->counter, c, c c_op i)) != c) \ + while ((old = arch_cmpxchg(&v->counter, c, c c_op i)) != c) \ c = old; \ \ return c; \ @@ -120,11 +112,6 @@ ATOMIC_OP(xor, ^) #undef ATOMIC_OP_RETURN #undef ATOMIC_OP -#undef __ga_cmpxchg -#undef __ga_xchg - -#ifdef CONFIG_ARCH_ATOMIC - #define arch_atomic_add_return generic_atomic_add_return #define arch_atomic_sub_return generic_atomic_sub_return @@ -146,29 +133,4 @@ ATOMIC_OP(xor, ^) #define arch_atomic_xchg(ptr, v) (arch_xchg(&(ptr)->counter, (v))) #define arch_atomic_cmpxchg(v, old, new) (arch_cmpxchg(&((v)->counter), (old), (new))) -#else /* CONFIG_ARCH_ATOMIC */ - -#define atomic_add_return generic_atomic_add_return -#define atomic_sub_return generic_atomic_sub_return - -#define atomic_fetch_add generic_atomic_fetch_add -#define atomic_fetch_sub generic_atomic_fetch_sub -#define atomic_fetch_and generic_atomic_fetch_and -#define atomic_fetch_or generic_atomic_fetch_or -#define atomic_fetch_xor generic_atomic_fetch_xor - -#define atomic_add generic_atomic_add -#define atomic_sub generic_atomic_sub -#define atomic_and generic_atomic_and -#define atomic_or generic_atomic_or -#define atomic_xor generic_atomic_xor - -#define atomic_read(v) READ_ONCE((v)->counter) -#define atomic_set(v, i) WRITE_ONCE(((v)->counter), (i)) - -#define atomic_xchg(ptr, v) (xchg(&(ptr)->counter, (v))) -#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) - -#endif /* CONFIG_ARCH_ATOMIC */ - #endif /* __ASM_GENERIC_ATOMIC_H */ diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h index c8c7d9fae820..100d24b02e52 100644 --- a/include/asm-generic/atomic64.h +++ b/include/asm-generic/atomic64.h @@ -49,8 +49,6 @@ extern s64 generic_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n); extern s64 generic_atomic64_xchg(atomic64_t *v, s64 new); extern s64 generic_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u); -#ifdef CONFIG_ARCH_ATOMIC - #define arch_atomic64_read generic_atomic64_read #define arch_atomic64_set generic_atomic64_set #define arch_atomic64_set_release generic_atomic64_set @@ -74,31 +72,4 @@ extern s64 generic_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u); #define arch_atomic64_xchg generic_atomic64_xchg #define arch_atomic64_fetch_add_unless generic_atomic64_fetch_add_unless -#else /* CONFIG_ARCH_ATOMIC */ - -#define atomic64_read generic_atomic64_read -#define atomic64_set generic_atomic64_set -#define atomic64_set_release generic_atomic64_set - -#define atomic64_add generic_atomic64_add -#define atomic64_add_return generic_atomic64_add_return -#define atomic64_fetch_add generic_atomic64_fetch_add -#define atomic64_sub generic_atomic64_sub -#define atomic64_sub_return generic_atomic64_sub_return -#define atomic64_fetch_sub generic_atomic64_fetch_sub - -#define atomic64_and generic_atomic64_and -#define atomic64_fetch_and generic_atomic64_fetch_and -#define atomic64_or generic_atomic64_or -#define atomic64_fetch_or generic_atomic64_fetch_or -#define atomic64_xor generic_atomic64_xor -#define atomic64_fetch_xor generic_atomic64_fetch_xor - -#define atomic64_dec_if_positive generic_atomic64_dec_if_positive -#define atomic64_cmpxchg generic_atomic64_cmpxchg -#define atomic64_xchg generic_atomic64_xchg -#define atomic64_fetch_add_unless generic_atomic64_fetch_add_unless - -#endif /* CONFIG_ARCH_ATOMIC */ - #endif /* _ASM_GENERIC_ATOMIC64_H */ diff --git a/include/asm-generic/cmpxchg.h b/include/asm-generic/cmpxchg.h index 98c931199089..dca4419922a9 100644 --- a/include/asm-generic/cmpxchg.h +++ b/include/asm-generic/cmpxchg.h @@ -97,8 +97,6 @@ unsigned long __generic_xchg(unsigned long x, volatile void *ptr, int size) __generic_cmpxchg64_local((ptr), (o), (n)) -#ifdef CONFIG_ARCH_ATOMIC - #ifndef arch_xchg #define arch_xchg generic_xchg #endif @@ -114,23 +112,4 @@ unsigned long __generic_xchg(unsigned long x, volatile void *ptr, int size) #define arch_cmpxchg arch_cmpxchg_local #define arch_cmpxchg64 arch_cmpxchg64_local -#else /* CONFIG_ARCH_ATOMIC */ - -#ifndef xchg -#define xchg generic_xchg -#endif - -#ifndef cmpxchg_local -#define cmpxchg_local generic_cmpxchg_local -#endif - -#ifndef cmpxchg64_local -#define cmpxchg64_local generic_cmpxchg64_local -#endif - -#define cmpxchg cmpxchg_local -#define cmpxchg64 cmpxchg64_local - -#endif /* CONFIG_ARCH_ATOMIC */ - #endif /* __ASM_GENERIC_CMPXCHG_H */ diff --git a/include/linux/atomic-fallback.h b/include/linux/atomic-fallback.h deleted file mode 100644 index 2a3f55d98be9..000000000000 --- a/include/linux/atomic-fallback.h +++ /dev/null @@ -1,2595 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -// Generated by scripts/atomic/gen-atomic-fallback.sh -// DO NOT MODIFY THIS FILE DIRECTLY - -#ifndef _LINUX_ATOMIC_FALLBACK_H -#define _LINUX_ATOMIC_FALLBACK_H - -#include - -#ifndef xchg_relaxed -#define xchg_acquire xchg -#define xchg_release xchg -#define xchg_relaxed xchg -#else /* xchg_relaxed */ - -#ifndef xchg_acquire -#define xchg_acquire(...) \ - __atomic_op_acquire(xchg, __VA_ARGS__) -#endif - -#ifndef xchg_release -#define xchg_release(...) \ - __atomic_op_release(xchg, __VA_ARGS__) -#endif - -#ifndef xchg -#define xchg(...) \ - __atomic_op_fence(xchg, __VA_ARGS__) -#endif - -#endif /* xchg_relaxed */ - -#ifndef cmpxchg_relaxed -#define cmpxchg_acquire cmpxchg -#define cmpxchg_release cmpxchg -#define cmpxchg_relaxed cmpxchg -#else /* cmpxchg_relaxed */ - -#ifndef cmpxchg_acquire -#define cmpxchg_acquire(...) \ - __atomic_op_acquire(cmpxchg, __VA_ARGS__) -#endif - -#ifndef cmpxchg_release -#define cmpxchg_release(...) \ - __atomic_op_release(cmpxchg, __VA_ARGS__) -#endif - -#ifndef cmpxchg -#define cmpxchg(...) \ - __atomic_op_fence(cmpxchg, __VA_ARGS__) -#endif - -#endif /* cmpxchg_relaxed */ - -#ifndef cmpxchg64_relaxed -#define cmpxchg64_acquire cmpxchg64 -#define cmpxchg64_release cmpxchg64 -#define cmpxchg64_relaxed cmpxchg64 -#else /* cmpxchg64_relaxed */ - -#ifndef cmpxchg64_acquire -#define cmpxchg64_acquire(...) \ - __atomic_op_acquire(cmpxchg64, __VA_ARGS__) -#endif - -#ifndef cmpxchg64_release -#define cmpxchg64_release(...) \ - __atomic_op_release(cmpxchg64, __VA_ARGS__) -#endif - -#ifndef cmpxchg64 -#define cmpxchg64(...) \ - __atomic_op_fence(cmpxchg64, __VA_ARGS__) -#endif - -#endif /* cmpxchg64_relaxed */ - -#ifndef try_cmpxchg_relaxed -#ifdef try_cmpxchg -#define try_cmpxchg_acquire try_cmpxchg -#define try_cmpxchg_release try_cmpxchg -#define try_cmpxchg_relaxed try_cmpxchg -#endif /* try_cmpxchg */ - -#ifndef try_cmpxchg -#define try_cmpxchg(_ptr, _oldp, _new) \ -({ \ - typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ - ___r = cmpxchg((_ptr), ___o, (_new)); \ - if (unlikely(___r != ___o)) \ - *___op = ___r; \ - likely(___r == ___o); \ -}) -#endif /* try_cmpxchg */ - -#ifndef try_cmpxchg_acquire -#define try_cmpxchg_acquire(_ptr, _oldp, _new) \ -({ \ - typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ - ___r = cmpxchg_acquire((_ptr), ___o, (_new)); \ - if (unlikely(___r != ___o)) \ - *___op = ___r; \ - likely(___r == ___o); \ -}) -#endif /* try_cmpxchg_acquire */ - -#ifndef try_cmpxchg_release -#define try_cmpxchg_release(_ptr, _oldp, _new) \ -({ \ - typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ - ___r = cmpxchg_release((_ptr), ___o, (_new)); \ - if (unlikely(___r != ___o)) \ - *___op = ___r; \ - likely(___r == ___o); \ -}) -#endif /* try_cmpxchg_release */ - -#ifndef try_cmpxchg_relaxed -#define try_cmpxchg_relaxed(_ptr, _oldp, _new) \ -({ \ - typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \ - ___r = cmpxchg_relaxed((_ptr), ___o, (_new)); \ - if (unlikely(___r != ___o)) \ - *___op = ___r; \ - likely(___r == ___o); \ -}) -#endif /* try_cmpxchg_relaxed */ - -#else /* try_cmpxchg_relaxed */ - -#ifndef try_cmpxchg_acquire -#define try_cmpxchg_acquire(...) \ - __atomic_op_acquire(try_cmpxchg, __VA_ARGS__) -#endif - -#ifndef try_cmpxchg_release -#define try_cmpxchg_release(...) \ - __atomic_op_release(try_cmpxchg, __VA_ARGS__) -#endif - -#ifndef try_cmpxchg -#define try_cmpxchg(...) \ - __atomic_op_fence(try_cmpxchg, __VA_ARGS__) -#endif - -#endif /* try_cmpxchg_relaxed */ - -#define arch_atomic_read atomic_read -#define arch_atomic_read_acquire atomic_read_acquire - -#ifndef atomic_read_acquire -static __always_inline int -atomic_read_acquire(const atomic_t *v) -{ - return smp_load_acquire(&(v)->counter); -} -#define atomic_read_acquire atomic_read_acquire -#endif - -#define arch_atomic_set atomic_set -#define arch_atomic_set_release atomic_set_release - -#ifndef atomic_set_release -static __always_inline void -atomic_set_release(atomic_t *v, int i) -{ - smp_store_release(&(v)->counter, i); -} -#define atomic_set_release atomic_set_release -#endif - -#define arch_atomic_add atomic_add - -#define arch_atomic_add_return atomic_add_return -#define arch_atomic_add_return_acquire atomic_add_return_acquire -#define arch_atomic_add_return_release atomic_add_return_release -#define arch_atomic_add_return_relaxed atomic_add_return_relaxed - -#ifndef atomic_add_return_relaxed -#define atomic_add_return_acquire atomic_add_return -#define atomic_add_return_release atomic_add_return -#define atomic_add_return_relaxed atomic_add_return -#else /* atomic_add_return_relaxed */ - -#ifndef atomic_add_return_acquire -static __always_inline int -atomic_add_return_acquire(int i, atomic_t *v) -{ - int ret = atomic_add_return_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_add_return_acquire atomic_add_return_acquire -#endif - -#ifndef atomic_add_return_release -static __always_inline int -atomic_add_return_release(int i, atomic_t *v) -{ - __atomic_release_fence(); - return atomic_add_return_relaxed(i, v); -} -#define atomic_add_return_release atomic_add_return_release -#endif - -#ifndef atomic_add_return -static __always_inline int -atomic_add_return(int i, atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_add_return_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_add_return atomic_add_return -#endif - -#endif /* atomic_add_return_relaxed */ - -#define arch_atomic_fetch_add atomic_fetch_add -#define arch_atomic_fetch_add_acquire atomic_fetch_add_acquire -#define arch_atomic_fetch_add_release atomic_fetch_add_release -#define arch_atomic_fetch_add_relaxed atomic_fetch_add_relaxed - -#ifndef atomic_fetch_add_relaxed -#define atomic_fetch_add_acquire atomic_fetch_add -#define atomic_fetch_add_release atomic_fetch_add -#define atomic_fetch_add_relaxed atomic_fetch_add -#else /* atomic_fetch_add_relaxed */ - -#ifndef atomic_fetch_add_acquire -static __always_inline int -atomic_fetch_add_acquire(int i, atomic_t *v) -{ - int ret = atomic_fetch_add_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_fetch_add_acquire atomic_fetch_add_acquire -#endif - -#ifndef atomic_fetch_add_release -static __always_inline int -atomic_fetch_add_release(int i, atomic_t *v) -{ - __atomic_release_fence(); - return atomic_fetch_add_relaxed(i, v); -} -#define atomic_fetch_add_release atomic_fetch_add_release -#endif - -#ifndef atomic_fetch_add -static __always_inline int -atomic_fetch_add(int i, atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_fetch_add_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_fetch_add atomic_fetch_add -#endif - -#endif /* atomic_fetch_add_relaxed */ - -#define arch_atomic_sub atomic_sub - -#define arch_atomic_sub_return atomic_sub_return -#define arch_atomic_sub_return_acquire atomic_sub_return_acquire -#define arch_atomic_sub_return_release atomic_sub_return_release -#define arch_atomic_sub_return_relaxed atomic_sub_return_relaxed - -#ifndef atomic_sub_return_relaxed -#define atomic_sub_return_acquire atomic_sub_return -#define atomic_sub_return_release atomic_sub_return -#define atomic_sub_return_relaxed atomic_sub_return -#else /* atomic_sub_return_relaxed */ - -#ifndef atomic_sub_return_acquire -static __always_inline int -atomic_sub_return_acquire(int i, atomic_t *v) -{ - int ret = atomic_sub_return_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_sub_return_acquire atomic_sub_return_acquire -#endif - -#ifndef atomic_sub_return_release -static __always_inline int -atomic_sub_return_release(int i, atomic_t *v) -{ - __atomic_release_fence(); - return atomic_sub_return_relaxed(i, v); -} -#define atomic_sub_return_release atomic_sub_return_release -#endif - -#ifndef atomic_sub_return -static __always_inline int -atomic_sub_return(int i, atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_sub_return_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_sub_return atomic_sub_return -#endif - -#endif /* atomic_sub_return_relaxed */ - -#define arch_atomic_fetch_sub atomic_fetch_sub -#define arch_atomic_fetch_sub_acquire atomic_fetch_sub_acquire -#define arch_atomic_fetch_sub_release atomic_fetch_sub_release -#define arch_atomic_fetch_sub_relaxed atomic_fetch_sub_relaxed - -#ifndef atomic_fetch_sub_relaxed -#define atomic_fetch_sub_acquire atomic_fetch_sub -#define atomic_fetch_sub_release atomic_fetch_sub -#define atomic_fetch_sub_relaxed atomic_fetch_sub -#else /* atomic_fetch_sub_relaxed */ - -#ifndef atomic_fetch_sub_acquire -static __always_inline int -atomic_fetch_sub_acquire(int i, atomic_t *v) -{ - int ret = atomic_fetch_sub_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_fetch_sub_acquire atomic_fetch_sub_acquire -#endif - -#ifndef atomic_fetch_sub_release -static __always_inline int -atomic_fetch_sub_release(int i, atomic_t *v) -{ - __atomic_release_fence(); - return atomic_fetch_sub_relaxed(i, v); -} -#define atomic_fetch_sub_release atomic_fetch_sub_release -#endif - -#ifndef atomic_fetch_sub -static __always_inline int -atomic_fetch_sub(int i, atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_fetch_sub_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_fetch_sub atomic_fetch_sub -#endif - -#endif /* atomic_fetch_sub_relaxed */ - -#define arch_atomic_inc atomic_inc - -#ifndef atomic_inc -static __always_inline void -atomic_inc(atomic_t *v) -{ - atomic_add(1, v); -} -#define atomic_inc atomic_inc -#endif - -#define arch_atomic_inc_return atomic_inc_return -#define arch_atomic_inc_return_acquire atomic_inc_return_acquire -#define arch_atomic_inc_return_release atomic_inc_return_release -#define arch_atomic_inc_return_relaxed atomic_inc_return_relaxed - -#ifndef atomic_inc_return_relaxed -#ifdef atomic_inc_return -#define atomic_inc_return_acquire atomic_inc_return -#define atomic_inc_return_release atomic_inc_return -#define atomic_inc_return_relaxed atomic_inc_return -#endif /* atomic_inc_return */ - -#ifndef atomic_inc_return -static __always_inline int -atomic_inc_return(atomic_t *v) -{ - return atomic_add_return(1, v); -} -#define atomic_inc_return atomic_inc_return -#endif - -#ifndef atomic_inc_return_acquire -static __always_inline int -atomic_inc_return_acquire(atomic_t *v) -{ - return atomic_add_return_acquire(1, v); -} -#define atomic_inc_return_acquire atomic_inc_return_acquire -#endif - -#ifndef atomic_inc_return_release -static __always_inline int -atomic_inc_return_release(atomic_t *v) -{ - return atomic_add_return_release(1, v); -} -#define atomic_inc_return_release atomic_inc_return_release -#endif - -#ifndef atomic_inc_return_relaxed -static __always_inline int -atomic_inc_return_relaxed(atomic_t *v) -{ - return atomic_add_return_relaxed(1, v); -} -#define atomic_inc_return_relaxed atomic_inc_return_relaxed -#endif - -#else /* atomic_inc_return_relaxed */ - -#ifndef atomic_inc_return_acquire -static __always_inline int -atomic_inc_return_acquire(atomic_t *v) -{ - int ret = atomic_inc_return_relaxed(v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_inc_return_acquire atomic_inc_return_acquire -#endif - -#ifndef atomic_inc_return_release -static __always_inline int -atomic_inc_return_release(atomic_t *v) -{ - __atomic_release_fence(); - return atomic_inc_return_relaxed(v); -} -#define atomic_inc_return_release atomic_inc_return_release -#endif - -#ifndef atomic_inc_return -static __always_inline int -atomic_inc_return(atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_inc_return_relaxed(v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_inc_return atomic_inc_return -#endif - -#endif /* atomic_inc_return_relaxed */ - -#define arch_atomic_fetch_inc atomic_fetch_inc -#define arch_atomic_fetch_inc_acquire atomic_fetch_inc_acquire -#define arch_atomic_fetch_inc_release atomic_fetch_inc_release -#define arch_atomic_fetch_inc_relaxed atomic_fetch_inc_relaxed - -#ifndef atomic_fetch_inc_relaxed -#ifdef atomic_fetch_inc -#define atomic_fetch_inc_acquire atomic_fetch_inc -#define atomic_fetch_inc_release atomic_fetch_inc -#define atomic_fetch_inc_relaxed atomic_fetch_inc -#endif /* atomic_fetch_inc */ - -#ifndef atomic_fetch_inc -static __always_inline int -atomic_fetch_inc(atomic_t *v) -{ - return atomic_fetch_add(1, v); -} -#define atomic_fetch_inc atomic_fetch_inc -#endif - -#ifndef atomic_fetch_inc_acquire -static __always_inline int -atomic_fetch_inc_acquire(atomic_t *v) -{ - return atomic_fetch_add_acquire(1, v); -} -#define atomic_fetch_inc_acquire atomic_fetch_inc_acquire -#endif - -#ifndef atomic_fetch_inc_release -static __always_inline int -atomic_fetch_inc_release(atomic_t *v) -{ - return atomic_fetch_add_release(1, v); -} -#define atomic_fetch_inc_release atomic_fetch_inc_release -#endif - -#ifndef atomic_fetch_inc_relaxed -static __always_inline int -atomic_fetch_inc_relaxed(atomic_t *v) -{ - return atomic_fetch_add_relaxed(1, v); -} -#define atomic_fetch_inc_relaxed atomic_fetch_inc_relaxed -#endif - -#else /* atomic_fetch_inc_relaxed */ - -#ifndef atomic_fetch_inc_acquire -static __always_inline int -atomic_fetch_inc_acquire(atomic_t *v) -{ - int ret = atomic_fetch_inc_relaxed(v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_fetch_inc_acquire atomic_fetch_inc_acquire -#endif - -#ifndef atomic_fetch_inc_release -static __always_inline int -atomic_fetch_inc_release(atomic_t *v) -{ - __atomic_release_fence(); - return atomic_fetch_inc_relaxed(v); -} -#define atomic_fetch_inc_release atomic_fetch_inc_release -#endif - -#ifndef atomic_fetch_inc -static __always_inline int -atomic_fetch_inc(atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_fetch_inc_relaxed(v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_fetch_inc atomic_fetch_inc -#endif - -#endif /* atomic_fetch_inc_relaxed */ - -#define arch_atomic_dec atomic_dec - -#ifndef atomic_dec -static __always_inline void -atomic_dec(atomic_t *v) -{ - atomic_sub(1, v); -} -#define atomic_dec atomic_dec -#endif - -#define arch_atomic_dec_return atomic_dec_return -#define arch_atomic_dec_return_acquire atomic_dec_return_acquire -#define arch_atomic_dec_return_release atomic_dec_return_release -#define arch_atomic_dec_return_relaxed atomic_dec_return_relaxed - -#ifndef atomic_dec_return_relaxed -#ifdef atomic_dec_return -#define atomic_dec_return_acquire atomic_dec_return -#define atomic_dec_return_release atomic_dec_return -#define atomic_dec_return_relaxed atomic_dec_return -#endif /* atomic_dec_return */ - -#ifndef atomic_dec_return -static __always_inline int -atomic_dec_return(atomic_t *v) -{ - return atomic_sub_return(1, v); -} -#define atomic_dec_return atomic_dec_return -#endif - -#ifndef atomic_dec_return_acquire -static __always_inline int -atomic_dec_return_acquire(atomic_t *v) -{ - return atomic_sub_return_acquire(1, v); -} -#define atomic_dec_return_acquire atomic_dec_return_acquire -#endif - -#ifndef atomic_dec_return_release -static __always_inline int -atomic_dec_return_release(atomic_t *v) -{ - return atomic_sub_return_release(1, v); -} -#define atomic_dec_return_release atomic_dec_return_release -#endif - -#ifndef atomic_dec_return_relaxed -static __always_inline int -atomic_dec_return_relaxed(atomic_t *v) -{ - return atomic_sub_return_relaxed(1, v); -} -#define atomic_dec_return_relaxed atomic_dec_return_relaxed -#endif - -#else /* atomic_dec_return_relaxed */ - -#ifndef atomic_dec_return_acquire -static __always_inline int -atomic_dec_return_acquire(atomic_t *v) -{ - int ret = atomic_dec_return_relaxed(v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_dec_return_acquire atomic_dec_return_acquire -#endif - -#ifndef atomic_dec_return_release -static __always_inline int -atomic_dec_return_release(atomic_t *v) -{ - __atomic_release_fence(); - return atomic_dec_return_relaxed(v); -} -#define atomic_dec_return_release atomic_dec_return_release -#endif - -#ifndef atomic_dec_return -static __always_inline int -atomic_dec_return(atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_dec_return_relaxed(v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_dec_return atomic_dec_return -#endif - -#endif /* atomic_dec_return_relaxed */ - -#define arch_atomic_fetch_dec atomic_fetch_dec -#define arch_atomic_fetch_dec_acquire atomic_fetch_dec_acquire -#define arch_atomic_fetch_dec_release atomic_fetch_dec_release -#define arch_atomic_fetch_dec_relaxed atomic_fetch_dec_relaxed - -#ifndef atomic_fetch_dec_relaxed -#ifdef atomic_fetch_dec -#define atomic_fetch_dec_acquire atomic_fetch_dec -#define atomic_fetch_dec_release atomic_fetch_dec -#define atomic_fetch_dec_relaxed atomic_fetch_dec -#endif /* atomic_fetch_dec */ - -#ifndef atomic_fetch_dec -static __always_inline int -atomic_fetch_dec(atomic_t *v) -{ - return atomic_fetch_sub(1, v); -} -#define atomic_fetch_dec atomic_fetch_dec -#endif - -#ifndef atomic_fetch_dec_acquire -static __always_inline int -atomic_fetch_dec_acquire(atomic_t *v) -{ - return atomic_fetch_sub_acquire(1, v); -} -#define atomic_fetch_dec_acquire atomic_fetch_dec_acquire -#endif - -#ifndef atomic_fetch_dec_release -static __always_inline int -atomic_fetch_dec_release(atomic_t *v) -{ - return atomic_fetch_sub_release(1, v); -} -#define atomic_fetch_dec_release atomic_fetch_dec_release -#endif - -#ifndef atomic_fetch_dec_relaxed -static __always_inline int -atomic_fetch_dec_relaxed(atomic_t *v) -{ - return atomic_fetch_sub_relaxed(1, v); -} -#define atomic_fetch_dec_relaxed atomic_fetch_dec_relaxed -#endif - -#else /* atomic_fetch_dec_relaxed */ - -#ifndef atomic_fetch_dec_acquire -static __always_inline int -atomic_fetch_dec_acquire(atomic_t *v) -{ - int ret = atomic_fetch_dec_relaxed(v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_fetch_dec_acquire atomic_fetch_dec_acquire -#endif - -#ifndef atomic_fetch_dec_release -static __always_inline int -atomic_fetch_dec_release(atomic_t *v) -{ - __atomic_release_fence(); - return atomic_fetch_dec_relaxed(v); -} -#define atomic_fetch_dec_release atomic_fetch_dec_release -#endif - -#ifndef atomic_fetch_dec -static __always_inline int -atomic_fetch_dec(atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_fetch_dec_relaxed(v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_fetch_dec atomic_fetch_dec -#endif - -#endif /* atomic_fetch_dec_relaxed */ - -#define arch_atomic_and atomic_and - -#define arch_atomic_fetch_and atomic_fetch_and -#define arch_atomic_fetch_and_acquire atomic_fetch_and_acquire -#define arch_atomic_fetch_and_release atomic_fetch_and_release -#define arch_atomic_fetch_and_relaxed atomic_fetch_and_relaxed - -#ifndef atomic_fetch_and_relaxed -#define atomic_fetch_and_acquire atomic_fetch_and -#define atomic_fetch_and_release atomic_fetch_and -#define atomic_fetch_and_relaxed atomic_fetch_and -#else /* atomic_fetch_and_relaxed */ - -#ifndef atomic_fetch_and_acquire -static __always_inline int -atomic_fetch_and_acquire(int i, atomic_t *v) -{ - int ret = atomic_fetch_and_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_fetch_and_acquire atomic_fetch_and_acquire -#endif - -#ifndef atomic_fetch_and_release -static __always_inline int -atomic_fetch_and_release(int i, atomic_t *v) -{ - __atomic_release_fence(); - return atomic_fetch_and_relaxed(i, v); -} -#define atomic_fetch_and_release atomic_fetch_and_release -#endif - -#ifndef atomic_fetch_and -static __always_inline int -atomic_fetch_and(int i, atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_fetch_and_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_fetch_and atomic_fetch_and -#endif - -#endif /* atomic_fetch_and_relaxed */ - -#define arch_atomic_andnot atomic_andnot - -#ifndef atomic_andnot -static __always_inline void -atomic_andnot(int i, atomic_t *v) -{ - atomic_and(~i, v); -} -#define atomic_andnot atomic_andnot -#endif - -#define arch_atomic_fetch_andnot atomic_fetch_andnot -#define arch_atomic_fetch_andnot_acquire atomic_fetch_andnot_acquire -#define arch_atomic_fetch_andnot_release atomic_fetch_andnot_release -#define arch_atomic_fetch_andnot_relaxed atomic_fetch_andnot_relaxed - -#ifndef atomic_fetch_andnot_relaxed -#ifdef atomic_fetch_andnot -#define atomic_fetch_andnot_acquire atomic_fetch_andnot -#define atomic_fetch_andnot_release atomic_fetch_andnot -#define atomic_fetch_andnot_relaxed atomic_fetch_andnot -#endif /* atomic_fetch_andnot */ - -#ifndef atomic_fetch_andnot -static __always_inline int -atomic_fetch_andnot(int i, atomic_t *v) -{ - return atomic_fetch_and(~i, v); -} -#define atomic_fetch_andnot atomic_fetch_andnot -#endif - -#ifndef atomic_fetch_andnot_acquire -static __always_inline int -atomic_fetch_andnot_acquire(int i, atomic_t *v) -{ - return atomic_fetch_and_acquire(~i, v); -} -#define atomic_fetch_andnot_acquire atomic_fetch_andnot_acquire -#endif - -#ifndef atomic_fetch_andnot_release -static __always_inline int -atomic_fetch_andnot_release(int i, atomic_t *v) -{ - return atomic_fetch_and_release(~i, v); -} -#define atomic_fetch_andnot_release atomic_fetch_andnot_release -#endif - -#ifndef atomic_fetch_andnot_relaxed -static __always_inline int -atomic_fetch_andnot_relaxed(int i, atomic_t *v) -{ - return atomic_fetch_and_relaxed(~i, v); -} -#define atomic_fetch_andnot_relaxed atomic_fetch_andnot_relaxed -#endif - -#else /* atomic_fetch_andnot_relaxed */ - -#ifndef atomic_fetch_andnot_acquire -static __always_inline int -atomic_fetch_andnot_acquire(int i, atomic_t *v) -{ - int ret = atomic_fetch_andnot_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_fetch_andnot_acquire atomic_fetch_andnot_acquire -#endif - -#ifndef atomic_fetch_andnot_release -static __always_inline int -atomic_fetch_andnot_release(int i, atomic_t *v) -{ - __atomic_release_fence(); - return atomic_fetch_andnot_relaxed(i, v); -} -#define atomic_fetch_andnot_release atomic_fetch_andnot_release -#endif - -#ifndef atomic_fetch_andnot -static __always_inline int -atomic_fetch_andnot(int i, atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_fetch_andnot_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_fetch_andnot atomic_fetch_andnot -#endif - -#endif /* atomic_fetch_andnot_relaxed */ - -#define arch_atomic_or atomic_or - -#define arch_atomic_fetch_or atomic_fetch_or -#define arch_atomic_fetch_or_acquire atomic_fetch_or_acquire -#define arch_atomic_fetch_or_release atomic_fetch_or_release -#define arch_atomic_fetch_or_relaxed atomic_fetch_or_relaxed - -#ifndef atomic_fetch_or_relaxed -#define atomic_fetch_or_acquire atomic_fetch_or -#define atomic_fetch_or_release atomic_fetch_or -#define atomic_fetch_or_relaxed atomic_fetch_or -#else /* atomic_fetch_or_relaxed */ - -#ifndef atomic_fetch_or_acquire -static __always_inline int -atomic_fetch_or_acquire(int i, atomic_t *v) -{ - int ret = atomic_fetch_or_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_fetch_or_acquire atomic_fetch_or_acquire -#endif - -#ifndef atomic_fetch_or_release -static __always_inline int -atomic_fetch_or_release(int i, atomic_t *v) -{ - __atomic_release_fence(); - return atomic_fetch_or_relaxed(i, v); -} -#define atomic_fetch_or_release atomic_fetch_or_release -#endif - -#ifndef atomic_fetch_or -static __always_inline int -atomic_fetch_or(int i, atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_fetch_or_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_fetch_or atomic_fetch_or -#endif - -#endif /* atomic_fetch_or_relaxed */ - -#define arch_atomic_xor atomic_xor - -#define arch_atomic_fetch_xor atomic_fetch_xor -#define arch_atomic_fetch_xor_acquire atomic_fetch_xor_acquire -#define arch_atomic_fetch_xor_release atomic_fetch_xor_release -#define arch_atomic_fetch_xor_relaxed atomic_fetch_xor_relaxed - -#ifndef atomic_fetch_xor_relaxed -#define atomic_fetch_xor_acquire atomic_fetch_xor -#define atomic_fetch_xor_release atomic_fetch_xor -#define atomic_fetch_xor_relaxed atomic_fetch_xor -#else /* atomic_fetch_xor_relaxed */ - -#ifndef atomic_fetch_xor_acquire -static __always_inline int -atomic_fetch_xor_acquire(int i, atomic_t *v) -{ - int ret = atomic_fetch_xor_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic_fetch_xor_acquire atomic_fetch_xor_acquire -#endif - -#ifndef atomic_fetch_xor_release -static __always_inline int -atomic_fetch_xor_release(int i, atomic_t *v) -{ - __atomic_release_fence(); - return atomic_fetch_xor_relaxed(i, v); -} -#define atomic_fetch_xor_release atomic_fetch_xor_release -#endif - -#ifndef atomic_fetch_xor -static __always_inline int -atomic_fetch_xor(int i, atomic_t *v) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_fetch_xor_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic_fetch_xor atomic_fetch_xor -#endif - -#endif /* atomic_fetch_xor_relaxed */ - -#define arch_atomic_xchg atomic_xchg -#define arch_atomic_xchg_acquire atomic_xchg_acquire -#define arch_atomic_xchg_release atomic_xchg_release -#define arch_atomic_xchg_relaxed atomic_xchg_relaxed - -#ifndef atomic_xchg_relaxed -#define atomic_xchg_acquire atomic_xchg -#define atomic_xchg_release atomic_xchg -#define atomic_xchg_relaxed atomic_xchg -#else /* atomic_xchg_relaxed */ - -#ifndef atomic_xchg_acquire -static __always_inline int -atomic_xchg_acquire(atomic_t *v, int i) -{ - int ret = atomic_xchg_relaxed(v, i); - __atomic_acquire_fence(); - return ret; -} -#define atomic_xchg_acquire atomic_xchg_acquire -#endif - -#ifndef atomic_xchg_release -static __always_inline int -atomic_xchg_release(atomic_t *v, int i) -{ - __atomic_release_fence(); - return atomic_xchg_relaxed(v, i); -} -#define atomic_xchg_release atomic_xchg_release -#endif - -#ifndef atomic_xchg -static __always_inline int -atomic_xchg(atomic_t *v, int i) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_xchg_relaxed(v, i); - __atomic_post_full_fence(); - return ret; -} -#define atomic_xchg atomic_xchg -#endif - -#endif /* atomic_xchg_relaxed */ - -#define arch_atomic_cmpxchg atomic_cmpxchg -#define arch_atomic_cmpxchg_acquire atomic_cmpxchg_acquire -#define arch_atomic_cmpxchg_release atomic_cmpxchg_release -#define arch_atomic_cmpxchg_relaxed atomic_cmpxchg_relaxed - -#ifndef atomic_cmpxchg_relaxed -#define atomic_cmpxchg_acquire atomic_cmpxchg -#define atomic_cmpxchg_release atomic_cmpxchg -#define atomic_cmpxchg_relaxed atomic_cmpxchg -#else /* atomic_cmpxchg_relaxed */ - -#ifndef atomic_cmpxchg_acquire -static __always_inline int -atomic_cmpxchg_acquire(atomic_t *v, int old, int new) -{ - int ret = atomic_cmpxchg_relaxed(v, old, new); - __atomic_acquire_fence(); - return ret; -} -#define atomic_cmpxchg_acquire atomic_cmpxchg_acquire -#endif - -#ifndef atomic_cmpxchg_release -static __always_inline int -atomic_cmpxchg_release(atomic_t *v, int old, int new) -{ - __atomic_release_fence(); - return atomic_cmpxchg_relaxed(v, old, new); -} -#define atomic_cmpxchg_release atomic_cmpxchg_release -#endif - -#ifndef atomic_cmpxchg -static __always_inline int -atomic_cmpxchg(atomic_t *v, int old, int new) -{ - int ret; - __atomic_pre_full_fence(); - ret = atomic_cmpxchg_relaxed(v, old, new); - __atomic_post_full_fence(); - return ret; -} -#define atomic_cmpxchg atomic_cmpxchg -#endif - -#endif /* atomic_cmpxchg_relaxed */ - -#define arch_atomic_try_cmpxchg atomic_try_cmpxchg -#define arch_atomic_try_cmpxchg_acquire atomic_try_cmpxchg_acquire -#define arch_atomic_try_cmpxchg_release atomic_try_cmpxchg_release -#define arch_atomic_try_cmpxchg_relaxed atomic_try_cmpxchg_relaxed - -#ifndef atomic_try_cmpxchg_relaxed -#ifdef atomic_try_cmpxchg -#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg -#define atomic_try_cmpxchg_release atomic_try_cmpxchg -#define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg -#endif /* atomic_try_cmpxchg */ - -#ifndef atomic_try_cmpxchg -static __always_inline bool -atomic_try_cmpxchg(atomic_t *v, int *old, int new) -{ - int r, o = *old; - r = atomic_cmpxchg(v, o, new); - if (unlikely(r != o)) - *old = r; - return likely(r == o); -} -#define atomic_try_cmpxchg atomic_try_cmpxchg -#endif - -#ifndef atomic_try_cmpxchg_acquire -static __always_inline bool -atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) -{ - int r, o = *old; - r = atomic_cmpxchg_acquire(v, o, new); - if (unlikely(r != o)) - *old = r; - return likely(r == o); -} -#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg_acquire -#endif - -#ifndef atomic_try_cmpxchg_release -static __always_inline bool -atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) -{ - int r, o = *old; - r = atomic_cmpxchg_release(v, o, new); - if (unlikely(r != o)) - *old = r; - return likely(r == o); -} -#define atomic_try_cmpxchg_release atomic_try_cmpxchg_release -#endif - -#ifndef atomic_try_cmpxchg_relaxed -static __always_inline bool -atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) -{ - int r, o = *old; - r = atomic_cmpxchg_relaxed(v, o, new); - if (unlikely(r != o)) - *old = r; - return likely(r == o); -} -#define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg_relaxed -#endif - -#else /* atomic_try_cmpxchg_relaxed */ - -#ifndef atomic_try_cmpxchg_acquire -static __always_inline bool -atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) -{ - bool ret = atomic_try_cmpxchg_relaxed(v, old, new); - __atomic_acquire_fence(); - return ret; -} -#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg_acquire -#endif - -#ifndef atomic_try_cmpxchg_release -static __always_inline bool -atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) -{ - __atomic_release_fence(); - return atomic_try_cmpxchg_relaxed(v, old, new); -} -#define atomic_try_cmpxchg_release atomic_try_cmpxchg_release -#endif - -#ifndef atomic_try_cmpxchg -static __always_inline bool -atomic_try_cmpxchg(atomic_t *v, int *old, int new) -{ - bool ret; - __atomic_pre_full_fence(); - ret = atomic_try_cmpxchg_relaxed(v, old, new); - __atomic_post_full_fence(); - return ret; -} -#define atomic_try_cmpxchg atomic_try_cmpxchg -#endif - -#endif /* atomic_try_cmpxchg_relaxed */ - -#define arch_atomic_sub_and_test atomic_sub_and_test - -#ifndef atomic_sub_and_test -/** - * atomic_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer of type atomic_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ -static __always_inline bool -atomic_sub_and_test(int i, atomic_t *v) -{ - return atomic_sub_return(i, v) == 0; -} -#define atomic_sub_and_test atomic_sub_and_test -#endif - -#define arch_atomic_dec_and_test atomic_dec_and_test - -#ifndef atomic_dec_and_test -/** - * atomic_dec_and_test - decrement and test - * @v: pointer of type atomic_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static __always_inline bool -atomic_dec_and_test(atomic_t *v) -{ - return atomic_dec_return(v) == 0; -} -#define atomic_dec_and_test atomic_dec_and_test -#endif - -#define arch_atomic_inc_and_test atomic_inc_and_test - -#ifndef atomic_inc_and_test -/** - * atomic_inc_and_test - increment and test - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static __always_inline bool -atomic_inc_and_test(atomic_t *v) -{ - return atomic_inc_return(v) == 0; -} -#define atomic_inc_and_test atomic_inc_and_test -#endif - -#define arch_atomic_add_negative atomic_add_negative - -#ifndef atomic_add_negative -/** - * atomic_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer of type atomic_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static __always_inline bool -atomic_add_negative(int i, atomic_t *v) -{ - return atomic_add_return(i, v) < 0; -} -#define atomic_add_negative atomic_add_negative -#endif - -#define arch_atomic_fetch_add_unless atomic_fetch_add_unless - -#ifndef atomic_fetch_add_unless -/** - * atomic_fetch_add_unless - add unless the number is already a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as @v was not already @u. - * Returns original value of @v - */ -static __always_inline int -atomic_fetch_add_unless(atomic_t *v, int a, int u) -{ - int c = atomic_read(v); - - do { - if (unlikely(c == u)) - break; - } while (!atomic_try_cmpxchg(v, &c, c + a)); - - return c; -} -#define atomic_fetch_add_unless atomic_fetch_add_unless -#endif - -#define arch_atomic_add_unless atomic_add_unless - -#ifndef atomic_add_unless -/** - * atomic_add_unless - add unless the number is already a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, if @v was not already @u. - * Returns true if the addition was done. - */ -static __always_inline bool -atomic_add_unless(atomic_t *v, int a, int u) -{ - return atomic_fetch_add_unless(v, a, u) != u; -} -#define atomic_add_unless atomic_add_unless -#endif - -#define arch_atomic_inc_not_zero atomic_inc_not_zero - -#ifndef atomic_inc_not_zero -/** - * atomic_inc_not_zero - increment unless the number is zero - * @v: pointer of type atomic_t - * - * Atomically increments @v by 1, if @v is non-zero. - * Returns true if the increment was done. - */ -static __always_inline bool -atomic_inc_not_zero(atomic_t *v) -{ - return atomic_add_unless(v, 1, 0); -} -#define atomic_inc_not_zero atomic_inc_not_zero -#endif - -#define arch_atomic_inc_unless_negative atomic_inc_unless_negative - -#ifndef atomic_inc_unless_negative -static __always_inline bool -atomic_inc_unless_negative(atomic_t *v) -{ - int c = atomic_read(v); - - do { - if (unlikely(c < 0)) - return false; - } while (!atomic_try_cmpxchg(v, &c, c + 1)); - - return true; -} -#define atomic_inc_unless_negative atomic_inc_unless_negative -#endif - -#define arch_atomic_dec_unless_positive atomic_dec_unless_positive - -#ifndef atomic_dec_unless_positive -static __always_inline bool -atomic_dec_unless_positive(atomic_t *v) -{ - int c = atomic_read(v); - - do { - if (unlikely(c > 0)) - return false; - } while (!atomic_try_cmpxchg(v, &c, c - 1)); - - return true; -} -#define atomic_dec_unless_positive atomic_dec_unless_positive -#endif - -#define arch_atomic_dec_if_positive atomic_dec_if_positive - -#ifndef atomic_dec_if_positive -static __always_inline int -atomic_dec_if_positive(atomic_t *v) -{ - int dec, c = atomic_read(v); - - do { - dec = c - 1; - if (unlikely(dec < 0)) - break; - } while (!atomic_try_cmpxchg(v, &c, dec)); - - return dec; -} -#define atomic_dec_if_positive atomic_dec_if_positive -#endif - -#ifdef CONFIG_GENERIC_ATOMIC64 -#include -#endif - -#define arch_atomic64_read atomic64_read -#define arch_atomic64_read_acquire atomic64_read_acquire - -#ifndef atomic64_read_acquire -static __always_inline s64 -atomic64_read_acquire(const atomic64_t *v) -{ - return smp_load_acquire(&(v)->counter); -} -#define atomic64_read_acquire atomic64_read_acquire -#endif - -#define arch_atomic64_set atomic64_set -#define arch_atomic64_set_release atomic64_set_release - -#ifndef atomic64_set_release -static __always_inline void -atomic64_set_release(atomic64_t *v, s64 i) -{ - smp_store_release(&(v)->counter, i); -} -#define atomic64_set_release atomic64_set_release -#endif - -#define arch_atomic64_add atomic64_add - -#define arch_atomic64_add_return atomic64_add_return -#define arch_atomic64_add_return_acquire atomic64_add_return_acquire -#define arch_atomic64_add_return_release atomic64_add_return_release -#define arch_atomic64_add_return_relaxed atomic64_add_return_relaxed - -#ifndef atomic64_add_return_relaxed -#define atomic64_add_return_acquire atomic64_add_return -#define atomic64_add_return_release atomic64_add_return -#define atomic64_add_return_relaxed atomic64_add_return -#else /* atomic64_add_return_relaxed */ - -#ifndef atomic64_add_return_acquire -static __always_inline s64 -atomic64_add_return_acquire(s64 i, atomic64_t *v) -{ - s64 ret = atomic64_add_return_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_add_return_acquire atomic64_add_return_acquire -#endif - -#ifndef atomic64_add_return_release -static __always_inline s64 -atomic64_add_return_release(s64 i, atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_add_return_relaxed(i, v); -} -#define atomic64_add_return_release atomic64_add_return_release -#endif - -#ifndef atomic64_add_return -static __always_inline s64 -atomic64_add_return(s64 i, atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_add_return_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_add_return atomic64_add_return -#endif - -#endif /* atomic64_add_return_relaxed */ - -#define arch_atomic64_fetch_add atomic64_fetch_add -#define arch_atomic64_fetch_add_acquire atomic64_fetch_add_acquire -#define arch_atomic64_fetch_add_release atomic64_fetch_add_release -#define arch_atomic64_fetch_add_relaxed atomic64_fetch_add_relaxed - -#ifndef atomic64_fetch_add_relaxed -#define atomic64_fetch_add_acquire atomic64_fetch_add -#define atomic64_fetch_add_release atomic64_fetch_add -#define atomic64_fetch_add_relaxed atomic64_fetch_add -#else /* atomic64_fetch_add_relaxed */ - -#ifndef atomic64_fetch_add_acquire -static __always_inline s64 -atomic64_fetch_add_acquire(s64 i, atomic64_t *v) -{ - s64 ret = atomic64_fetch_add_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_fetch_add_acquire atomic64_fetch_add_acquire -#endif - -#ifndef atomic64_fetch_add_release -static __always_inline s64 -atomic64_fetch_add_release(s64 i, atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_fetch_add_relaxed(i, v); -} -#define atomic64_fetch_add_release atomic64_fetch_add_release -#endif - -#ifndef atomic64_fetch_add -static __always_inline s64 -atomic64_fetch_add(s64 i, atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_fetch_add_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_fetch_add atomic64_fetch_add -#endif - -#endif /* atomic64_fetch_add_relaxed */ - -#define arch_atomic64_sub atomic64_sub - -#define arch_atomic64_sub_return atomic64_sub_return -#define arch_atomic64_sub_return_acquire atomic64_sub_return_acquire -#define arch_atomic64_sub_return_release atomic64_sub_return_release -#define arch_atomic64_sub_return_relaxed atomic64_sub_return_relaxed - -#ifndef atomic64_sub_return_relaxed -#define atomic64_sub_return_acquire atomic64_sub_return -#define atomic64_sub_return_release atomic64_sub_return -#define atomic64_sub_return_relaxed atomic64_sub_return -#else /* atomic64_sub_return_relaxed */ - -#ifndef atomic64_sub_return_acquire -static __always_inline s64 -atomic64_sub_return_acquire(s64 i, atomic64_t *v) -{ - s64 ret = atomic64_sub_return_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_sub_return_acquire atomic64_sub_return_acquire -#endif - -#ifndef atomic64_sub_return_release -static __always_inline s64 -atomic64_sub_return_release(s64 i, atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_sub_return_relaxed(i, v); -} -#define atomic64_sub_return_release atomic64_sub_return_release -#endif - -#ifndef atomic64_sub_return -static __always_inline s64 -atomic64_sub_return(s64 i, atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_sub_return_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_sub_return atomic64_sub_return -#endif - -#endif /* atomic64_sub_return_relaxed */ - -#define arch_atomic64_fetch_sub atomic64_fetch_sub -#define arch_atomic64_fetch_sub_acquire atomic64_fetch_sub_acquire -#define arch_atomic64_fetch_sub_release atomic64_fetch_sub_release -#define arch_atomic64_fetch_sub_relaxed atomic64_fetch_sub_relaxed - -#ifndef atomic64_fetch_sub_relaxed -#define atomic64_fetch_sub_acquire atomic64_fetch_sub -#define atomic64_fetch_sub_release atomic64_fetch_sub -#define atomic64_fetch_sub_relaxed atomic64_fetch_sub -#else /* atomic64_fetch_sub_relaxed */ - -#ifndef atomic64_fetch_sub_acquire -static __always_inline s64 -atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) -{ - s64 ret = atomic64_fetch_sub_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_fetch_sub_acquire atomic64_fetch_sub_acquire -#endif - -#ifndef atomic64_fetch_sub_release -static __always_inline s64 -atomic64_fetch_sub_release(s64 i, atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_fetch_sub_relaxed(i, v); -} -#define atomic64_fetch_sub_release atomic64_fetch_sub_release -#endif - -#ifndef atomic64_fetch_sub -static __always_inline s64 -atomic64_fetch_sub(s64 i, atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_fetch_sub_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_fetch_sub atomic64_fetch_sub -#endif - -#endif /* atomic64_fetch_sub_relaxed */ - -#define arch_atomic64_inc atomic64_inc - -#ifndef atomic64_inc -static __always_inline void -atomic64_inc(atomic64_t *v) -{ - atomic64_add(1, v); -} -#define atomic64_inc atomic64_inc -#endif - -#define arch_atomic64_inc_return atomic64_inc_return -#define arch_atomic64_inc_return_acquire atomic64_inc_return_acquire -#define arch_atomic64_inc_return_release atomic64_inc_return_release -#define arch_atomic64_inc_return_relaxed atomic64_inc_return_relaxed - -#ifndef atomic64_inc_return_relaxed -#ifdef atomic64_inc_return -#define atomic64_inc_return_acquire atomic64_inc_return -#define atomic64_inc_return_release atomic64_inc_return -#define atomic64_inc_return_relaxed atomic64_inc_return -#endif /* atomic64_inc_return */ - -#ifndef atomic64_inc_return -static __always_inline s64 -atomic64_inc_return(atomic64_t *v) -{ - return atomic64_add_return(1, v); -} -#define atomic64_inc_return atomic64_inc_return -#endif - -#ifndef atomic64_inc_return_acquire -static __always_inline s64 -atomic64_inc_return_acquire(atomic64_t *v) -{ - return atomic64_add_return_acquire(1, v); -} -#define atomic64_inc_return_acquire atomic64_inc_return_acquire -#endif - -#ifndef atomic64_inc_return_release -static __always_inline s64 -atomic64_inc_return_release(atomic64_t *v) -{ - return atomic64_add_return_release(1, v); -} -#define atomic64_inc_return_release atomic64_inc_return_release -#endif - -#ifndef atomic64_inc_return_relaxed -static __always_inline s64 -atomic64_inc_return_relaxed(atomic64_t *v) -{ - return atomic64_add_return_relaxed(1, v); -} -#define atomic64_inc_return_relaxed atomic64_inc_return_relaxed -#endif - -#else /* atomic64_inc_return_relaxed */ - -#ifndef atomic64_inc_return_acquire -static __always_inline s64 -atomic64_inc_return_acquire(atomic64_t *v) -{ - s64 ret = atomic64_inc_return_relaxed(v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_inc_return_acquire atomic64_inc_return_acquire -#endif - -#ifndef atomic64_inc_return_release -static __always_inline s64 -atomic64_inc_return_release(atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_inc_return_relaxed(v); -} -#define atomic64_inc_return_release atomic64_inc_return_release -#endif - -#ifndef atomic64_inc_return -static __always_inline s64 -atomic64_inc_return(atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_inc_return_relaxed(v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_inc_return atomic64_inc_return -#endif - -#endif /* atomic64_inc_return_relaxed */ - -#define arch_atomic64_fetch_inc atomic64_fetch_inc -#define arch_atomic64_fetch_inc_acquire atomic64_fetch_inc_acquire -#define arch_atomic64_fetch_inc_release atomic64_fetch_inc_release -#define arch_atomic64_fetch_inc_relaxed atomic64_fetch_inc_relaxed - -#ifndef atomic64_fetch_inc_relaxed -#ifdef atomic64_fetch_inc -#define atomic64_fetch_inc_acquire atomic64_fetch_inc -#define atomic64_fetch_inc_release atomic64_fetch_inc -#define atomic64_fetch_inc_relaxed atomic64_fetch_inc -#endif /* atomic64_fetch_inc */ - -#ifndef atomic64_fetch_inc -static __always_inline s64 -atomic64_fetch_inc(atomic64_t *v) -{ - return atomic64_fetch_add(1, v); -} -#define atomic64_fetch_inc atomic64_fetch_inc -#endif - -#ifndef atomic64_fetch_inc_acquire -static __always_inline s64 -atomic64_fetch_inc_acquire(atomic64_t *v) -{ - return atomic64_fetch_add_acquire(1, v); -} -#define atomic64_fetch_inc_acquire atomic64_fetch_inc_acquire -#endif - -#ifndef atomic64_fetch_inc_release -static __always_inline s64 -atomic64_fetch_inc_release(atomic64_t *v) -{ - return atomic64_fetch_add_release(1, v); -} -#define atomic64_fetch_inc_release atomic64_fetch_inc_release -#endif - -#ifndef atomic64_fetch_inc_relaxed -static __always_inline s64 -atomic64_fetch_inc_relaxed(atomic64_t *v) -{ - return atomic64_fetch_add_relaxed(1, v); -} -#define atomic64_fetch_inc_relaxed atomic64_fetch_inc_relaxed -#endif - -#else /* atomic64_fetch_inc_relaxed */ - -#ifndef atomic64_fetch_inc_acquire -static __always_inline s64 -atomic64_fetch_inc_acquire(atomic64_t *v) -{ - s64 ret = atomic64_fetch_inc_relaxed(v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_fetch_inc_acquire atomic64_fetch_inc_acquire -#endif - -#ifndef atomic64_fetch_inc_release -static __always_inline s64 -atomic64_fetch_inc_release(atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_fetch_inc_relaxed(v); -} -#define atomic64_fetch_inc_release atomic64_fetch_inc_release -#endif - -#ifndef atomic64_fetch_inc -static __always_inline s64 -atomic64_fetch_inc(atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_fetch_inc_relaxed(v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_fetch_inc atomic64_fetch_inc -#endif - -#endif /* atomic64_fetch_inc_relaxed */ - -#define arch_atomic64_dec atomic64_dec - -#ifndef atomic64_dec -static __always_inline void -atomic64_dec(atomic64_t *v) -{ - atomic64_sub(1, v); -} -#define atomic64_dec atomic64_dec -#endif - -#define arch_atomic64_dec_return atomic64_dec_return -#define arch_atomic64_dec_return_acquire atomic64_dec_return_acquire -#define arch_atomic64_dec_return_release atomic64_dec_return_release -#define arch_atomic64_dec_return_relaxed atomic64_dec_return_relaxed - -#ifndef atomic64_dec_return_relaxed -#ifdef atomic64_dec_return -#define atomic64_dec_return_acquire atomic64_dec_return -#define atomic64_dec_return_release atomic64_dec_return -#define atomic64_dec_return_relaxed atomic64_dec_return -#endif /* atomic64_dec_return */ - -#ifndef atomic64_dec_return -static __always_inline s64 -atomic64_dec_return(atomic64_t *v) -{ - return atomic64_sub_return(1, v); -} -#define atomic64_dec_return atomic64_dec_return -#endif - -#ifndef atomic64_dec_return_acquire -static __always_inline s64 -atomic64_dec_return_acquire(atomic64_t *v) -{ - return atomic64_sub_return_acquire(1, v); -} -#define atomic64_dec_return_acquire atomic64_dec_return_acquire -#endif - -#ifndef atomic64_dec_return_release -static __always_inline s64 -atomic64_dec_return_release(atomic64_t *v) -{ - return atomic64_sub_return_release(1, v); -} -#define atomic64_dec_return_release atomic64_dec_return_release -#endif - -#ifndef atomic64_dec_return_relaxed -static __always_inline s64 -atomic64_dec_return_relaxed(atomic64_t *v) -{ - return atomic64_sub_return_relaxed(1, v); -} -#define atomic64_dec_return_relaxed atomic64_dec_return_relaxed -#endif - -#else /* atomic64_dec_return_relaxed */ - -#ifndef atomic64_dec_return_acquire -static __always_inline s64 -atomic64_dec_return_acquire(atomic64_t *v) -{ - s64 ret = atomic64_dec_return_relaxed(v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_dec_return_acquire atomic64_dec_return_acquire -#endif - -#ifndef atomic64_dec_return_release -static __always_inline s64 -atomic64_dec_return_release(atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_dec_return_relaxed(v); -} -#define atomic64_dec_return_release atomic64_dec_return_release -#endif - -#ifndef atomic64_dec_return -static __always_inline s64 -atomic64_dec_return(atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_dec_return_relaxed(v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_dec_return atomic64_dec_return -#endif - -#endif /* atomic64_dec_return_relaxed */ - -#define arch_atomic64_fetch_dec atomic64_fetch_dec -#define arch_atomic64_fetch_dec_acquire atomic64_fetch_dec_acquire -#define arch_atomic64_fetch_dec_release atomic64_fetch_dec_release -#define arch_atomic64_fetch_dec_relaxed atomic64_fetch_dec_relaxed - -#ifndef atomic64_fetch_dec_relaxed -#ifdef atomic64_fetch_dec -#define atomic64_fetch_dec_acquire atomic64_fetch_dec -#define atomic64_fetch_dec_release atomic64_fetch_dec -#define atomic64_fetch_dec_relaxed atomic64_fetch_dec -#endif /* atomic64_fetch_dec */ - -#ifndef atomic64_fetch_dec -static __always_inline s64 -atomic64_fetch_dec(atomic64_t *v) -{ - return atomic64_fetch_sub(1, v); -} -#define atomic64_fetch_dec atomic64_fetch_dec -#endif - -#ifndef atomic64_fetch_dec_acquire -static __always_inline s64 -atomic64_fetch_dec_acquire(atomic64_t *v) -{ - return atomic64_fetch_sub_acquire(1, v); -} -#define atomic64_fetch_dec_acquire atomic64_fetch_dec_acquire -#endif - -#ifndef atomic64_fetch_dec_release -static __always_inline s64 -atomic64_fetch_dec_release(atomic64_t *v) -{ - return atomic64_fetch_sub_release(1, v); -} -#define atomic64_fetch_dec_release atomic64_fetch_dec_release -#endif - -#ifndef atomic64_fetch_dec_relaxed -static __always_inline s64 -atomic64_fetch_dec_relaxed(atomic64_t *v) -{ - return atomic64_fetch_sub_relaxed(1, v); -} -#define atomic64_fetch_dec_relaxed atomic64_fetch_dec_relaxed -#endif - -#else /* atomic64_fetch_dec_relaxed */ - -#ifndef atomic64_fetch_dec_acquire -static __always_inline s64 -atomic64_fetch_dec_acquire(atomic64_t *v) -{ - s64 ret = atomic64_fetch_dec_relaxed(v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_fetch_dec_acquire atomic64_fetch_dec_acquire -#endif - -#ifndef atomic64_fetch_dec_release -static __always_inline s64 -atomic64_fetch_dec_release(atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_fetch_dec_relaxed(v); -} -#define atomic64_fetch_dec_release atomic64_fetch_dec_release -#endif - -#ifndef atomic64_fetch_dec -static __always_inline s64 -atomic64_fetch_dec(atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_fetch_dec_relaxed(v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_fetch_dec atomic64_fetch_dec -#endif - -#endif /* atomic64_fetch_dec_relaxed */ - -#define arch_atomic64_and atomic64_and - -#define arch_atomic64_fetch_and atomic64_fetch_and -#define arch_atomic64_fetch_and_acquire atomic64_fetch_and_acquire -#define arch_atomic64_fetch_and_release atomic64_fetch_and_release -#define arch_atomic64_fetch_and_relaxed atomic64_fetch_and_relaxed - -#ifndef atomic64_fetch_and_relaxed -#define atomic64_fetch_and_acquire atomic64_fetch_and -#define atomic64_fetch_and_release atomic64_fetch_and -#define atomic64_fetch_and_relaxed atomic64_fetch_and -#else /* atomic64_fetch_and_relaxed */ - -#ifndef atomic64_fetch_and_acquire -static __always_inline s64 -atomic64_fetch_and_acquire(s64 i, atomic64_t *v) -{ - s64 ret = atomic64_fetch_and_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_fetch_and_acquire atomic64_fetch_and_acquire -#endif - -#ifndef atomic64_fetch_and_release -static __always_inline s64 -atomic64_fetch_and_release(s64 i, atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_fetch_and_relaxed(i, v); -} -#define atomic64_fetch_and_release atomic64_fetch_and_release -#endif - -#ifndef atomic64_fetch_and -static __always_inline s64 -atomic64_fetch_and(s64 i, atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_fetch_and_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_fetch_and atomic64_fetch_and -#endif - -#endif /* atomic64_fetch_and_relaxed */ - -#define arch_atomic64_andnot atomic64_andnot - -#ifndef atomic64_andnot -static __always_inline void -atomic64_andnot(s64 i, atomic64_t *v) -{ - atomic64_and(~i, v); -} -#define atomic64_andnot atomic64_andnot -#endif - -#define arch_atomic64_fetch_andnot atomic64_fetch_andnot -#define arch_atomic64_fetch_andnot_acquire atomic64_fetch_andnot_acquire -#define arch_atomic64_fetch_andnot_release atomic64_fetch_andnot_release -#define arch_atomic64_fetch_andnot_relaxed atomic64_fetch_andnot_relaxed - -#ifndef atomic64_fetch_andnot_relaxed -#ifdef atomic64_fetch_andnot -#define atomic64_fetch_andnot_acquire atomic64_fetch_andnot -#define atomic64_fetch_andnot_release atomic64_fetch_andnot -#define atomic64_fetch_andnot_relaxed atomic64_fetch_andnot -#endif /* atomic64_fetch_andnot */ - -#ifndef atomic64_fetch_andnot -static __always_inline s64 -atomic64_fetch_andnot(s64 i, atomic64_t *v) -{ - return atomic64_fetch_and(~i, v); -} -#define atomic64_fetch_andnot atomic64_fetch_andnot -#endif - -#ifndef atomic64_fetch_andnot_acquire -static __always_inline s64 -atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) -{ - return atomic64_fetch_and_acquire(~i, v); -} -#define atomic64_fetch_andnot_acquire atomic64_fetch_andnot_acquire -#endif - -#ifndef atomic64_fetch_andnot_release -static __always_inline s64 -atomic64_fetch_andnot_release(s64 i, atomic64_t *v) -{ - return atomic64_fetch_and_release(~i, v); -} -#define atomic64_fetch_andnot_release atomic64_fetch_andnot_release -#endif - -#ifndef atomic64_fetch_andnot_relaxed -static __always_inline s64 -atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) -{ - return atomic64_fetch_and_relaxed(~i, v); -} -#define atomic64_fetch_andnot_relaxed atomic64_fetch_andnot_relaxed -#endif - -#else /* atomic64_fetch_andnot_relaxed */ - -#ifndef atomic64_fetch_andnot_acquire -static __always_inline s64 -atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) -{ - s64 ret = atomic64_fetch_andnot_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_fetch_andnot_acquire atomic64_fetch_andnot_acquire -#endif - -#ifndef atomic64_fetch_andnot_release -static __always_inline s64 -atomic64_fetch_andnot_release(s64 i, atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_fetch_andnot_relaxed(i, v); -} -#define atomic64_fetch_andnot_release atomic64_fetch_andnot_release -#endif - -#ifndef atomic64_fetch_andnot -static __always_inline s64 -atomic64_fetch_andnot(s64 i, atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_fetch_andnot_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_fetch_andnot atomic64_fetch_andnot -#endif - -#endif /* atomic64_fetch_andnot_relaxed */ - -#define arch_atomic64_or atomic64_or - -#define arch_atomic64_fetch_or atomic64_fetch_or -#define arch_atomic64_fetch_or_acquire atomic64_fetch_or_acquire -#define arch_atomic64_fetch_or_release atomic64_fetch_or_release -#define arch_atomic64_fetch_or_relaxed atomic64_fetch_or_relaxed - -#ifndef atomic64_fetch_or_relaxed -#define atomic64_fetch_or_acquire atomic64_fetch_or -#define atomic64_fetch_or_release atomic64_fetch_or -#define atomic64_fetch_or_relaxed atomic64_fetch_or -#else /* atomic64_fetch_or_relaxed */ - -#ifndef atomic64_fetch_or_acquire -static __always_inline s64 -atomic64_fetch_or_acquire(s64 i, atomic64_t *v) -{ - s64 ret = atomic64_fetch_or_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_fetch_or_acquire atomic64_fetch_or_acquire -#endif - -#ifndef atomic64_fetch_or_release -static __always_inline s64 -atomic64_fetch_or_release(s64 i, atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_fetch_or_relaxed(i, v); -} -#define atomic64_fetch_or_release atomic64_fetch_or_release -#endif - -#ifndef atomic64_fetch_or -static __always_inline s64 -atomic64_fetch_or(s64 i, atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_fetch_or_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_fetch_or atomic64_fetch_or -#endif - -#endif /* atomic64_fetch_or_relaxed */ - -#define arch_atomic64_xor atomic64_xor - -#define arch_atomic64_fetch_xor atomic64_fetch_xor -#define arch_atomic64_fetch_xor_acquire atomic64_fetch_xor_acquire -#define arch_atomic64_fetch_xor_release atomic64_fetch_xor_release -#define arch_atomic64_fetch_xor_relaxed atomic64_fetch_xor_relaxed - -#ifndef atomic64_fetch_xor_relaxed -#define atomic64_fetch_xor_acquire atomic64_fetch_xor -#define atomic64_fetch_xor_release atomic64_fetch_xor -#define atomic64_fetch_xor_relaxed atomic64_fetch_xor -#else /* atomic64_fetch_xor_relaxed */ - -#ifndef atomic64_fetch_xor_acquire -static __always_inline s64 -atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) -{ - s64 ret = atomic64_fetch_xor_relaxed(i, v); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_fetch_xor_acquire atomic64_fetch_xor_acquire -#endif - -#ifndef atomic64_fetch_xor_release -static __always_inline s64 -atomic64_fetch_xor_release(s64 i, atomic64_t *v) -{ - __atomic_release_fence(); - return atomic64_fetch_xor_relaxed(i, v); -} -#define atomic64_fetch_xor_release atomic64_fetch_xor_release -#endif - -#ifndef atomic64_fetch_xor -static __always_inline s64 -atomic64_fetch_xor(s64 i, atomic64_t *v) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_fetch_xor_relaxed(i, v); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_fetch_xor atomic64_fetch_xor -#endif - -#endif /* atomic64_fetch_xor_relaxed */ - -#define arch_atomic64_xchg atomic64_xchg -#define arch_atomic64_xchg_acquire atomic64_xchg_acquire -#define arch_atomic64_xchg_release atomic64_xchg_release -#define arch_atomic64_xchg_relaxed atomic64_xchg_relaxed - -#ifndef atomic64_xchg_relaxed -#define atomic64_xchg_acquire atomic64_xchg -#define atomic64_xchg_release atomic64_xchg -#define atomic64_xchg_relaxed atomic64_xchg -#else /* atomic64_xchg_relaxed */ - -#ifndef atomic64_xchg_acquire -static __always_inline s64 -atomic64_xchg_acquire(atomic64_t *v, s64 i) -{ - s64 ret = atomic64_xchg_relaxed(v, i); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_xchg_acquire atomic64_xchg_acquire -#endif - -#ifndef atomic64_xchg_release -static __always_inline s64 -atomic64_xchg_release(atomic64_t *v, s64 i) -{ - __atomic_release_fence(); - return atomic64_xchg_relaxed(v, i); -} -#define atomic64_xchg_release atomic64_xchg_release -#endif - -#ifndef atomic64_xchg -static __always_inline s64 -atomic64_xchg(atomic64_t *v, s64 i) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_xchg_relaxed(v, i); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_xchg atomic64_xchg -#endif - -#endif /* atomic64_xchg_relaxed */ - -#define arch_atomic64_cmpxchg atomic64_cmpxchg -#define arch_atomic64_cmpxchg_acquire atomic64_cmpxchg_acquire -#define arch_atomic64_cmpxchg_release atomic64_cmpxchg_release -#define arch_atomic64_cmpxchg_relaxed atomic64_cmpxchg_relaxed - -#ifndef atomic64_cmpxchg_relaxed -#define atomic64_cmpxchg_acquire atomic64_cmpxchg -#define atomic64_cmpxchg_release atomic64_cmpxchg -#define atomic64_cmpxchg_relaxed atomic64_cmpxchg -#else /* atomic64_cmpxchg_relaxed */ - -#ifndef atomic64_cmpxchg_acquire -static __always_inline s64 -atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) -{ - s64 ret = atomic64_cmpxchg_relaxed(v, old, new); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_cmpxchg_acquire atomic64_cmpxchg_acquire -#endif - -#ifndef atomic64_cmpxchg_release -static __always_inline s64 -atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) -{ - __atomic_release_fence(); - return atomic64_cmpxchg_relaxed(v, old, new); -} -#define atomic64_cmpxchg_release atomic64_cmpxchg_release -#endif - -#ifndef atomic64_cmpxchg -static __always_inline s64 -atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) -{ - s64 ret; - __atomic_pre_full_fence(); - ret = atomic64_cmpxchg_relaxed(v, old, new); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_cmpxchg atomic64_cmpxchg -#endif - -#endif /* atomic64_cmpxchg_relaxed */ - -#define arch_atomic64_try_cmpxchg atomic64_try_cmpxchg -#define arch_atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg_acquire -#define arch_atomic64_try_cmpxchg_release atomic64_try_cmpxchg_release -#define arch_atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg_relaxed - -#ifndef atomic64_try_cmpxchg_relaxed -#ifdef atomic64_try_cmpxchg -#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg -#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg -#define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg -#endif /* atomic64_try_cmpxchg */ - -#ifndef atomic64_try_cmpxchg -static __always_inline bool -atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) -{ - s64 r, o = *old; - r = atomic64_cmpxchg(v, o, new); - if (unlikely(r != o)) - *old = r; - return likely(r == o); -} -#define atomic64_try_cmpxchg atomic64_try_cmpxchg -#endif - -#ifndef atomic64_try_cmpxchg_acquire -static __always_inline bool -atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) -{ - s64 r, o = *old; - r = atomic64_cmpxchg_acquire(v, o, new); - if (unlikely(r != o)) - *old = r; - return likely(r == o); -} -#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg_acquire -#endif - -#ifndef atomic64_try_cmpxchg_release -static __always_inline bool -atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) -{ - s64 r, o = *old; - r = atomic64_cmpxchg_release(v, o, new); - if (unlikely(r != o)) - *old = r; - return likely(r == o); -} -#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg_release -#endif - -#ifndef atomic64_try_cmpxchg_relaxed -static __always_inline bool -atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) -{ - s64 r, o = *old; - r = atomic64_cmpxchg_relaxed(v, o, new); - if (unlikely(r != o)) - *old = r; - return likely(r == o); -} -#define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg_relaxed -#endif - -#else /* atomic64_try_cmpxchg_relaxed */ - -#ifndef atomic64_try_cmpxchg_acquire -static __always_inline bool -atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) -{ - bool ret = atomic64_try_cmpxchg_relaxed(v, old, new); - __atomic_acquire_fence(); - return ret; -} -#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg_acquire -#endif - -#ifndef atomic64_try_cmpxchg_release -static __always_inline bool -atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) -{ - __atomic_release_fence(); - return atomic64_try_cmpxchg_relaxed(v, old, new); -} -#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg_release -#endif - -#ifndef atomic64_try_cmpxchg -static __always_inline bool -atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) -{ - bool ret; - __atomic_pre_full_fence(); - ret = atomic64_try_cmpxchg_relaxed(v, old, new); - __atomic_post_full_fence(); - return ret; -} -#define atomic64_try_cmpxchg atomic64_try_cmpxchg -#endif - -#endif /* atomic64_try_cmpxchg_relaxed */ - -#define arch_atomic64_sub_and_test atomic64_sub_and_test - -#ifndef atomic64_sub_and_test -/** - * atomic64_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer of type atomic64_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ -static __always_inline bool -atomic64_sub_and_test(s64 i, atomic64_t *v) -{ - return atomic64_sub_return(i, v) == 0; -} -#define atomic64_sub_and_test atomic64_sub_and_test -#endif - -#define arch_atomic64_dec_and_test atomic64_dec_and_test - -#ifndef atomic64_dec_and_test -/** - * atomic64_dec_and_test - decrement and test - * @v: pointer of type atomic64_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static __always_inline bool -atomic64_dec_and_test(atomic64_t *v) -{ - return atomic64_dec_return(v) == 0; -} -#define atomic64_dec_and_test atomic64_dec_and_test -#endif - -#define arch_atomic64_inc_and_test atomic64_inc_and_test - -#ifndef atomic64_inc_and_test -/** - * atomic64_inc_and_test - increment and test - * @v: pointer of type atomic64_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static __always_inline bool -atomic64_inc_and_test(atomic64_t *v) -{ - return atomic64_inc_return(v) == 0; -} -#define atomic64_inc_and_test atomic64_inc_and_test -#endif - -#define arch_atomic64_add_negative atomic64_add_negative - -#ifndef atomic64_add_negative -/** - * atomic64_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer of type atomic64_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static __always_inline bool -atomic64_add_negative(s64 i, atomic64_t *v) -{ - return atomic64_add_return(i, v) < 0; -} -#define atomic64_add_negative atomic64_add_negative -#endif - -#define arch_atomic64_fetch_add_unless atomic64_fetch_add_unless - -#ifndef atomic64_fetch_add_unless -/** - * atomic64_fetch_add_unless - add unless the number is already a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as @v was not already @u. - * Returns original value of @v - */ -static __always_inline s64 -atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) -{ - s64 c = atomic64_read(v); - - do { - if (unlikely(c == u)) - break; - } while (!atomic64_try_cmpxchg(v, &c, c + a)); - - return c; -} -#define atomic64_fetch_add_unless atomic64_fetch_add_unless -#endif - -#define arch_atomic64_add_unless atomic64_add_unless - -#ifndef atomic64_add_unless -/** - * atomic64_add_unless - add unless the number is already a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, if @v was not already @u. - * Returns true if the addition was done. - */ -static __always_inline bool -atomic64_add_unless(atomic64_t *v, s64 a, s64 u) -{ - return atomic64_fetch_add_unless(v, a, u) != u; -} -#define atomic64_add_unless atomic64_add_unless -#endif - -#define arch_atomic64_inc_not_zero atomic64_inc_not_zero - -#ifndef atomic64_inc_not_zero -/** - * atomic64_inc_not_zero - increment unless the number is zero - * @v: pointer of type atomic64_t - * - * Atomically increments @v by 1, if @v is non-zero. - * Returns true if the increment was done. - */ -static __always_inline bool -atomic64_inc_not_zero(atomic64_t *v) -{ - return atomic64_add_unless(v, 1, 0); -} -#define atomic64_inc_not_zero atomic64_inc_not_zero -#endif - -#define arch_atomic64_inc_unless_negative atomic64_inc_unless_negative - -#ifndef atomic64_inc_unless_negative -static __always_inline bool -atomic64_inc_unless_negative(atomic64_t *v) -{ - s64 c = atomic64_read(v); - - do { - if (unlikely(c < 0)) - return false; - } while (!atomic64_try_cmpxchg(v, &c, c + 1)); - - return true; -} -#define atomic64_inc_unless_negative atomic64_inc_unless_negative -#endif - -#define arch_atomic64_dec_unless_positive atomic64_dec_unless_positive - -#ifndef atomic64_dec_unless_positive -static __always_inline bool -atomic64_dec_unless_positive(atomic64_t *v) -{ - s64 c = atomic64_read(v); - - do { - if (unlikely(c > 0)) - return false; - } while (!atomic64_try_cmpxchg(v, &c, c - 1)); - - return true; -} -#define atomic64_dec_unless_positive atomic64_dec_unless_positive -#endif - -#define arch_atomic64_dec_if_positive atomic64_dec_if_positive - -#ifndef atomic64_dec_if_positive -static __always_inline s64 -atomic64_dec_if_positive(atomic64_t *v) -{ - s64 dec, c = atomic64_read(v); - - do { - dec = c - 1; - if (unlikely(dec < 0)) - break; - } while (!atomic64_try_cmpxchg(v, &c, dec)); - - return dec; -} -#define atomic64_dec_if_positive atomic64_dec_if_positive -#endif - -#endif /* _LINUX_ATOMIC_FALLBACK_H */ -// d78e6c293c661c15188f0ec05bce45188c8d5892 diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 4f8d83f9e480..ed1d3ffd5b9d 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -77,12 +77,8 @@ __ret; \ }) -#ifdef CONFIG_ARCH_ATOMIC #include #include -#else -#include -#endif #include diff --git a/scripts/atomic/check-atomics.sh b/scripts/atomic/check-atomics.sh index 82748d42ecc5..9c7fbd4bcbce 100755 --- a/scripts/atomic/check-atomics.sh +++ b/scripts/atomic/check-atomics.sh @@ -17,7 +17,6 @@ cat < ${LINUXDIR}/include/${header} -- cgit v1.2.3 From d92cc4d5164398cc6d191084b46e622976c0ba89 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 12 May 2021 16:57:25 +0900 Subject: kbuild: require all architectures to have arch/$(SRCARCH)/Kbuild arch/$(SRCARCH)/Kbuild is useful for Makefile cleanups because you can use the obj-y syntax. Add an empty file if it is missing in arch/$(SRCARCH)/. Signed-off-by: Masahiro Yamada --- Makefile | 2 +- arch/alpha/Kbuild | 1 + arch/arc/Makefile | 3 --- arch/arm/Makefile | 1 - arch/arm64/Makefile | 1 - arch/csky/Kbuild | 1 + arch/h8300/Kbuild | 1 + arch/hexagon/Kbuild | 1 + arch/ia64/Kbuild | 1 + arch/m68k/Makefile | 1 - arch/microblaze/Kbuild | 1 + arch/mips/Makefile | 3 --- arch/nds32/Kbuild | 1 + arch/nios2/Kbuild | 1 + arch/openrisc/Makefile | 1 - arch/parisc/Kbuild | 1 + arch/powerpc/Makefile | 3 --- arch/riscv/Makefile | 1 - arch/s390/Makefile | 3 --- arch/sh/Kbuild | 1 + arch/sparc/Makefile | 3 --- arch/um/Kbuild | 1 + arch/x86/Makefile | 3 --- arch/xtensa/Kbuild | 1 + 24 files changed, 13 insertions(+), 24 deletions(-) create mode 100644 arch/alpha/Kbuild create mode 100644 arch/csky/Kbuild create mode 100644 arch/h8300/Kbuild create mode 100644 arch/hexagon/Kbuild create mode 100644 arch/ia64/Kbuild create mode 100644 arch/microblaze/Kbuild create mode 100644 arch/nds32/Kbuild create mode 100644 arch/nios2/Kbuild create mode 100644 arch/parisc/Kbuild create mode 100644 arch/sh/Kbuild create mode 100644 arch/um/Kbuild create mode 100644 arch/xtensa/Kbuild (limited to 'arch/x86') diff --git a/Makefile b/Makefile index b0bd0ce0aab1..4dcfe9c48d60 100644 --- a/Makefile +++ b/Makefile @@ -658,7 +658,7 @@ endif ifeq ($(KBUILD_EXTMOD),) # Objects we will link into vmlinux / subdirs we need to visit -core-y := init/ usr/ +core-y := init/ usr/ arch/$(SRCARCH)/ drivers-y := drivers/ sound/ drivers-$(CONFIG_SAMPLES) += samples/ drivers-$(CONFIG_NET) += net/ diff --git a/arch/alpha/Kbuild b/arch/alpha/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/alpha/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/arc/Makefile b/arch/arc/Makefile index e47adc97a89b..c0d87ac2e221 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -85,9 +85,6 @@ KBUILD_LDFLAGS += $(ldflags-y) head-y := arch/arc/kernel/head.o -# See arch/arc/Kbuild for content of core part of the kernel -core-y += arch/arc/ - # w/o this dtb won't embed into kernel binary core-y += arch/arc/boot/dts/ diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 415c3514573a..173da685a52e 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -252,7 +252,6 @@ endif export TEXT_OFFSET GZFLAGS MMUEXT -core-y += arch/arm/ # If we have a machine-specific directory, then include it in the build. core-y += $(machdirs) $(platdirs) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index b52481f0605d..be87c656dda3 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -149,7 +149,6 @@ KBUILD_CFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) KBUILD_CPPFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) KBUILD_AFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) -core-y += arch/arm64/ libs-y := arch/arm64/lib/ $(libs-y) libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/csky/Kbuild b/arch/csky/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/csky/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/h8300/Kbuild b/arch/h8300/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/h8300/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/hexagon/Kbuild b/arch/hexagon/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/hexagon/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/ia64/Kbuild b/arch/ia64/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/ia64/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index 82620f14124d..c7710dbbcc49 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -98,7 +98,6 @@ head-$(CONFIG_SUN3) := arch/m68k/kernel/sun3-head.o head-$(CONFIG_M68000) := arch/m68k/68000/head.o head-$(CONFIG_COLDFIRE) := arch/m68k/coldfire/head.o -core-y += arch/m68k/ libs-y += arch/m68k/lib/ diff --git a/arch/microblaze/Kbuild b/arch/microblaze/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/microblaze/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 258234c35a09..4e942b7ef022 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -332,9 +332,6 @@ head-y := arch/mips/kernel/head.o libs-y += arch/mips/lib/ libs-$(CONFIG_MIPS_FP_SUPPORT) += arch/mips/math-emu/ -# See arch/mips/Kbuild for content of core part of the kernel -core-y += arch/mips/ - drivers-y += arch/mips/crypto/ # suspend and hibernation support diff --git a/arch/nds32/Kbuild b/arch/nds32/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/nds32/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/nios2/Kbuild b/arch/nios2/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/nios2/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/openrisc/Makefile b/arch/openrisc/Makefile index 410e7abfac69..c52de526e518 100644 --- a/arch/openrisc/Makefile +++ b/arch/openrisc/Makefile @@ -42,7 +42,6 @@ endif head-y := arch/openrisc/kernel/head.o -core-y += arch/openrisc/ libs-y += $(LIBGCC) PHONY += vmlinux.bin diff --git a/arch/parisc/Kbuild b/arch/parisc/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/parisc/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 3212d076ac6a..af669aa75b73 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -267,9 +267,6 @@ head-$(CONFIG_PPC_FPU) += arch/powerpc/kernel/fpu.o head-$(CONFIG_ALTIVEC) += arch/powerpc/kernel/vector.o head-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += arch/powerpc/kernel/prom_init.o -# See arch/powerpc/Kbuild for content of core part of the kernel -core-y += arch/powerpc/ - # Default to zImage, override when needed all: zImage diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 3eb9590a0775..c5f359540862 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -90,7 +90,6 @@ endif head-y := arch/riscv/kernel/head.o -core-y += arch/riscv/ core-$(CONFIG_RISCV_ERRATA_ALTERNATIVE) += arch/riscv/errata/ libs-y += arch/riscv/lib/ diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e443ed9947bd..37b61645694c 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -128,9 +128,6 @@ OBJCOPYFLAGS := -O binary head-y := arch/s390/kernel/head64.o -# See arch/s390/Kbuild for content of core part of the kernel -core-y += arch/s390/ - libs-y += arch/s390/lib/ drivers-y += drivers/s390/ diff --git a/arch/sh/Kbuild b/arch/sh/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/sh/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile index bee99e65fe23..4e65245bc755 100644 --- a/arch/sparc/Makefile +++ b/arch/sparc/Makefile @@ -58,9 +58,6 @@ endif head-y := arch/sparc/kernel/head_$(BITS).o -# See arch/sparc/Kbuild for the core part of the kernel -core-y += arch/sparc/ - libs-y += arch/sparc/prom/ libs-y += arch/sparc/lib/ diff --git a/arch/um/Kbuild b/arch/um/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/um/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 307529417021..d181a19ef556 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -239,9 +239,6 @@ head-y += arch/x86/kernel/platform-quirks.o libs-y += arch/x86/lib/ -# See arch/x86/Kbuild for content of core part of the kernel -core-y += arch/x86/ - # drivers-y are linked after core-y drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ drivers-$(CONFIG_PCI) += arch/x86/pci/ diff --git a/arch/xtensa/Kbuild b/arch/xtensa/Kbuild new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/xtensa/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only -- cgit v1.2.3 From 72b268a8e9307a1757f61af080e990b5baa11d2a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:32 -0700 Subject: KVM: X86: Bail out of direct yield in case of under-committed scenarios In case of under-committed scenarios, vCPUs can be scheduled easily; kvm_vcpu_yield_to adds extra overhead, and it is also common to see when vcpu->ready is true but yield later failing due to p->state is TASK_RUNNING. Let's bail out in such scenarios by checking the length of current cpu runqueue, which can be treated as a hint of under-committed instead of guarantee of accuracy. 30%+ of directed-yield attempts can now avoid the expensive lookups in kvm_sched_yield() in an under-committed scenario. Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-2-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9b6bca616929..dfb7c320581f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8360,6 +8360,9 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) vcpu->stat.directed_yield_attempted++; + if (single_task_running()) + goto no_yield; + rcu_read_lock(); map = rcu_dereference(vcpu->kvm->arch.apic_map); -- cgit v1.2.3 From 1eff0ada88b48e4ac1e3fe26483b3684fedecd27 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:33 -0700 Subject: KVM: X86: Fix vCPU preempted state from guest's point of view Commit 66570e966dd9 (kvm: x86: only provide PV features if enabled in guest's CPUID) avoids to access pv tlb shootdown host side logic when this pv feature is not exposed to guest, however, kvm_steal_time.preempted not only leveraged by pv tlb shootdown logic but also mitigate the lock holder preemption issue. From guest's point of view, vCPU is always preempted since we lose the reset of kvm_steal_time.preempted before vmentry if pv tlb shootdown feature is not exposed. This patch fixes it by clearing kvm_steal_time.preempted before vmentry. Fixes: 66570e966dd9 (kvm: x86: only provide PV features if enabled in guest's CPUID) Reviewed-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-3-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dfb7c320581f..bed7b5348c0e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3105,6 +3105,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + } else { + st->preempted = 0; } vcpu->arch.st.preempted = 0; -- cgit v1.2.3 From da6d63a0062a3ee721b84123b83ec093f25759b0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:34 -0700 Subject: KVM: X86: hyper-v: Task srcu lock when accessing kvm_memslots() WARNING: suspicious RCU usage 5.13.0-rc1 #4 Not tainted ----------------------------- ./include/linux/kvm_host.h:710 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by hyperv_clock/8318: #0: ffffb6b8cb05a7d8 (&hv->hv_lock){+.+.}-{3:3}, at: kvm_hv_invalidate_tsc_page+0x3e/0xa0 [kvm] stack backtrace: CPU: 3 PID: 8318 Comm: hyperv_clock Not tainted 5.13.0-rc1 #4 Call Trace: dump_stack+0x87/0xb7 lockdep_rcu_suspicious+0xce/0xf0 kvm_write_guest_page+0x1c1/0x1d0 [kvm] kvm_write_guest+0x50/0x90 [kvm] kvm_hv_invalidate_tsc_page+0x79/0xa0 [kvm] kvm_gen_update_masterclock+0x1d/0x110 [kvm] kvm_arch_vm_ioctl+0x2a7/0xc50 [kvm] kvm_vm_ioctl+0x123/0x11d0 [kvm] __x64_sys_ioctl+0x3ed/0x9d0 do_syscall_64+0x3d/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae kvm_memslots() will be called by kvm_write_guest(), so we should take the srcu lock. Fixes: e880c6ea5 (KVM: x86: hyper-v: Prevent using not-yet-updated TSC page by secondary CPUs) Reviewed-by: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-4-git-send-email-wanpengli@tencent.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f98370a39936..f00830e5202f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1172,6 +1172,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); u64 gfn; + int idx; if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET || @@ -1190,9 +1191,16 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm) gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; hv->tsc_ref.tsc_sequence = 0; + + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&kvm->srcu); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; + srcu_read_unlock(&kvm->srcu, idx); out_unlock: mutex_unlock(&hv->hv_lock); -- cgit v1.2.3 From 9805cf03fdb6828091fe09e4ef0fb544fca3eaf6 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 18 May 2021 05:00:35 -0700 Subject: KVM: LAPIC: Narrow the timer latency between wait_lapic_expire and world switch Let's treat lapic_timer_advance_ns automatic tuning logic as hypervisor overhead, move it before wait_lapic_expire instead of between wait_lapic_expire and the world switch, the wait duration should be calculated by the up-to-date guest_tsc after the overhead of automatic tuning logic. This patch reduces ~30+ cycles for kvm-unit-tests/tscdeadline-latency when testing busy waits. Signed-off-by: Wanpeng Li Message-Id: <1621339235-11131-5-git-send-email-wanpengli@tencent.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c0ebef560bd1..5d91f2367c31 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1598,11 +1598,19 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); apic->lapic_timer.advance_expire_delta = guest_tsc - tsc_deadline; + if (lapic_timer_advance_dynamic) { + adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); + /* + * If the timer fired early, reread the TSC to account for the + * overhead of the above adjustment to avoid waiting longer + * than is necessary. + */ + if (guest_tsc < tsc_deadline) + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + } + if (guest_tsc < tsc_deadline) __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - - if (lapic_timer_advance_dynamic) - adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta); } void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 57ab87947abfc4e0b0b9864dc4717326a1c28a39 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 25 May 2021 10:41:16 -0300 Subject: KVM: x86: add start_assignment hook to kvm_x86_ops Add a start_assignment hook to kvm_x86_ops, which is called when kvm_arch_start_assignment is done. The hook is required to update the wakeup vector of a sleeping vCPU when a device is assigned to the guest. Signed-off-by: Marcelo Tosatti Message-Id: <20210525134321.254128742@redhat.com> Reviewed-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 323641097f63..e7bef91cee04 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -99,6 +99,7 @@ KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) +KVM_X86_OP_NULL(start_assignment) KVM_X86_OP_NULL(apicv_post_state_restore) KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt) KVM_X86_OP_NULL(set_hv_timer) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 55efbacfc244..9c7ced0e3171 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1352,6 +1352,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); + void (*start_assignment)(struct kvm *kvm); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bed7b5348c0e..98538b1cb453 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11504,7 +11504,8 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) void kvm_arch_start_assignment(struct kvm *kvm) { - atomic_inc(&kvm->arch.assigned_device_count); + if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) + static_call_cond(kvm_x86_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); -- cgit v1.2.3 From 084071d5e9226add45a6031928bf10e6afc855fd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 25 May 2021 10:41:17 -0300 Subject: KVM: rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK KVM_REQ_UNBLOCK will be used to exit a vcpu from its inner vcpu halt emulation loop. Rename KVM_REQ_PENDING_TIMER to KVM_REQ_UNBLOCK, switch PowerPC to arch specific request bit. Signed-off-by: Marcelo Tosatti Message-Id: <20210525134321.303768132@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/vcpu-requests.rst | 8 +++++--- arch/powerpc/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 2 ++ 6 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst index 5feb3706a7ae..af1b37441e0a 100644 --- a/Documentation/virt/kvm/vcpu-requests.rst +++ b/Documentation/virt/kvm/vcpu-requests.rst @@ -118,10 +118,12 @@ KVM_REQ_MMU_RELOAD necessary to inform each VCPU to completely refresh the tables. This request is used for that. -KVM_REQ_PENDING_TIMER +KVM_REQ_UNBLOCK - This request may be made from a timer handler run on the host on behalf - of a VCPU. It informs the VCPU thread to inject a timer interrupt. + This request informs the vCPU to exit kvm_vcpu_block. It is used for + example from timer handlers that run on the host on behalf of a vCPU, + or in order to update the interrupt routing and ensure that assigned + devices will wake up the vCPU. KVM_REQ_UNHALT diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1e83359f286b..7f2e90db2050 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -51,6 +51,7 @@ /* PPC-specific vcpu->requests bit members */ #define KVM_REQ_WATCHDOG KVM_ARCH_REQ(0) #define KVM_REQ_EPR_EXIT KVM_ARCH_REQ(1) +#define KVM_REQ_PENDING_TIMER KVM_ARCH_REQ(2) #include diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5d91f2367c31..8120e8614b92 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1669,7 +1669,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(&apic->lapic_timer.pending); - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_make_request(KVM_REQ_UNBLOCK, vcpu); if (from_timer_fn) kvm_vcpu_kick(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 98538b1cb453..fe464b66898f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9501,7 +9501,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5d4b96b36ec0..76102efbf079 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -147,7 +147,7 @@ static inline bool is_error_page(struct page *page) */ #define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 #define KVM_REQUEST_ARCH_BASE 8 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5f40725144f5..37a2d500a148 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2929,6 +2929,8 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) goto out; if (signal_pending(current)) goto out; + if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu)) + goto out; ret = 0; out: -- cgit v1.2.3 From a2486020a82eefad686993695eb42d1b64f3f2fd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 26 May 2021 14:20:14 -0300 Subject: KVM: VMX: update vcpu posted-interrupt descriptor when assigning device For VMX, when a vcpu enters HLT emulation, pi_post_block will: 1) Add vcpu to per-cpu list of blocked vcpus. 2) Program the posted-interrupt descriptor "notification vector" to POSTED_INTR_WAKEUP_VECTOR With interrupt remapping, an interrupt will set the PIR bit for the vector programmed for the device on the CPU, test-and-set the ON bit on the posted interrupt descriptor, and if the ON bit is clear generate an interrupt for the notification vector. This way, the target CPU wakes upon a device interrupt and wakes up the target vcpu. Problem is that pi_post_block only programs the notification vector if kvm_arch_has_assigned_device() is true. Its possible for the following to happen: 1) vcpu V HLTs on pcpu P, kvm_arch_has_assigned_device is false, notification vector is not programmed 2) device is assigned to VM 3) device interrupts vcpu V, sets ON bit (notification vector not programmed, so pcpu P remains in idle) 4) vcpu 0 IPIs vcpu V (in guest), but since pi descriptor ON bit is set, kvm_vcpu_kick is skipped 5) vcpu 0 busy spins on vcpu V's response for several seconds, until RCU watchdog NMIs all vCPUs. To fix this, use the start_assignment kvm_x86_ops callback to kick vcpus out of the halt loop, so the notification vector is properly reprogrammed to the wakeup vector. Reported-by: Pei Zhang Signed-off-by: Marcelo Tosatti Message-Id: <20210526172014.GA29007@fuller.cnet> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 14 ++++++++++++++ arch/x86/kvm/vmx/posted_intr.h | 1 + arch/x86/kvm/vmx/vmx.c | 1 + virt/kvm/kvm_main.c | 1 + 4 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 459748680daf..5f81ef092bd4 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -237,6 +237,20 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) } +/* + * Bail out of the block loop if the VM has an assigned + * device, but the blocking vCPU didn't reconfigure the + * PI.NV to the wakeup vector, i.e. the assigned device + * came along after the initial check in pi_pre_block(). + */ +void vmx_pi_start_assignment(struct kvm *kvm) +{ + if (!irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); +} + /* * pi_update_irte - set IRTE for Posted-Interrupts * diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 0bdc41391c5b..7f7b2326caf5 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -95,5 +95,6 @@ void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); +void vmx_pi_start_assignment(struct kvm *kvm); #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4bceb5ca3a89..639ec3eba9b8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7721,6 +7721,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .nested_ops = &vmx_nested_ops, .update_pi_irte = pi_update_irte, + .start_assignment = vmx_pi_start_assignment, #ifdef CONFIG_X86_64 .set_hv_timer = vmx_set_hv_timer, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 37a2d500a148..6a6bc7af0e28 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -307,6 +307,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { return kvm_make_all_cpus_request_except(kvm, req, NULL); } +EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) -- cgit v1.2.3 From bedd9195df3dfea7165e7d6f7519a1568bc41936 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 26 May 2021 16:32:27 +0000 Subject: KVM: x86/mmu: Fix comment mentioning skip_4k This comment was left over from a previous version of the patch that introduced wrprot_gfn_range, when skip_4k was passed in instead of min_level. Signed-off-by: David Matlack Message-Id: <20210526163227.3113557-1-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 95eeb5ac6a8a..237317b1eddd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1192,9 +1192,9 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) } /* - * Remove write access from all the SPTEs mapping GFNs [start, end). If - * skip_4k is set, SPTEs that map 4k pages, will not be write-protected. - * Returns true if an SPTE has been changed and the TLBs need to be flushed. + * Remove write access from all SPTEs at or above min_level that map GFNs + * [start, end). Returns true if an SPTE has been changed and the TLBs need to + * be flushed. */ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, int min_level) -- cgit v1.2.3 From 94a311ce248e0b53c76e110fd00511af47b72ffb Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Wed, 26 May 2021 22:16:01 +0530 Subject: x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types Add the (HWID, MCATYPE) tuples and names for new SMCA bank types. Also, add their respective error descriptions to the MCE decoding module edac_mce_amd. Also while at it, optimize the string names for some SMCA banks. [ bp: Drop repeated comments, explain why UMC_V2 is a separate entry. ] Signed-off-by: Muralidhara M K Signed-off-by: Naveen Krishna Chatradhi Signed-off-by: Borislav Petkov Reviewed-by: Yazen Ghannam Link: https://lkml.kernel.org/r/20210526164601.66228-1-nchatrad@amd.com --- arch/x86/include/asm/mce.h | 13 +++++--- arch/x86/kernel/cpu/mce/amd.c | 55 +++++++++++++++++++++------------- drivers/edac/mce_amd.c | 70 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ddfb3cad8dff..0607ec4f5091 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -305,7 +305,7 @@ extern void apei_mce_report_mem_error(int corrected, /* These may be used by multiple smca_hwid_mcatypes */ enum smca_bank_types { SMCA_LS = 0, /* Load Store */ - SMCA_LS_V2, /* Load Store */ + SMCA_LS_V2, SMCA_IF, /* Instruction Fetch */ SMCA_L2_CACHE, /* L2 Cache */ SMCA_DE, /* Decoder Unit */ @@ -314,17 +314,22 @@ enum smca_bank_types { SMCA_FP, /* Floating Point */ SMCA_L3_CACHE, /* L3 Cache */ SMCA_CS, /* Coherent Slave */ - SMCA_CS_V2, /* Coherent Slave */ + SMCA_CS_V2, SMCA_PIE, /* Power, Interrupts, etc. */ SMCA_UMC, /* Unified Memory Controller */ + SMCA_UMC_V2, SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ - SMCA_PSP_V2, /* Platform Security Processor */ + SMCA_PSP_V2, SMCA_SMU, /* System Management Unit */ - SMCA_SMU_V2, /* System Management Unit */ + SMCA_SMU_V2, SMCA_MP5, /* Microprocessor 5 Unit */ SMCA_NBIO, /* Northbridge IO Unit */ SMCA_PCIE, /* PCI Express Unit */ + SMCA_PCIE_V2, + SMCA_XGMI_PCS, /* xGMI PCS Unit */ + SMCA_XGMI_PHY, /* xGMI PHY Unit */ + SMCA_WAFL_PHY, /* WAFL PHY Unit */ N_SMCA_BANK_TYPES }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index e486f96b3cb3..08831acc1d03 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -77,27 +77,29 @@ struct smca_bank_name { }; static struct smca_bank_name smca_names[] = { - [SMCA_LS] = { "load_store", "Load Store Unit" }, - [SMCA_LS_V2] = { "load_store", "Load Store Unit" }, - [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, - [SMCA_DE] = { "decode_unit", "Decode Unit" }, - [SMCA_RESERVED] = { "reserved", "Reserved" }, - [SMCA_EX] = { "execution_unit", "Execution Unit" }, - [SMCA_FP] = { "floating_point", "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, - [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, - [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, - [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, - [SMCA_UMC] = { "umc", "Unified Memory Controller" }, - [SMCA_PB] = { "param_block", "Parameter Block" }, - [SMCA_PSP] = { "psp", "Platform Security Processor" }, - [SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, - [SMCA_SMU] = { "smu", "System Management Unit" }, - [SMCA_SMU_V2] = { "smu", "System Management Unit" }, - [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, - [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, - [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, + [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_RESERVED] = { "reserved", "Reserved" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, + + /* UMC v2 is separate because both of them can exist in a single system. */ + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, + [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, + [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, + [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, + [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, + [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, + [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, }; static const char *smca_get_name(enum smca_bank_types t) @@ -155,6 +157,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* Unified Memory Controller MCA type */ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, + { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) }, /* Parameter Block MCA type */ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) }, @@ -175,6 +178,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = { /* PCI Express Unit MCA type */ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) }, + { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) }, + + /* xGMI PCS MCA type */ + { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) }, + + /* xGMI PHY MCA type */ + { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, + + /* WAFL PHY MCA type */ + { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 5dd905a3f30c..43ba0f931629 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -323,6 +323,21 @@ static const char * const smca_umc_mce_desc[] = { "AES SRAM ECC error", }; +static const char * const smca_umc2_mce_desc[] = { + "DRAM ECC error", + "Data poison error", + "SDP parity error", + "Reserved", + "Address/Command parity error", + "Write data parity error", + "DCQ SRAM ECC error", + "Reserved", + "Read data parity error", + "Rdb SRAM ECC error", + "RdRsp SRAM ECC error", + "LM32 MP errors", +}; + static const char * const smca_pb_mce_desc[] = { "An ECC error in the Parameter Block RAM array", }; @@ -400,6 +415,56 @@ static const char * const smca_pcie_mce_desc[] = { "CCIX Non-okay write response with data error", }; +static const char * const smca_pcie2_mce_desc[] = { + "SDP Parity Error logging", +}; + +static const char * const smca_xgmipcs_mce_desc[] = { + "Data Loss Error", + "Training Error", + "Flow Control Acknowledge Error", + "Rx Fifo Underflow Error", + "Rx Fifo Overflow Error", + "CRC Error", + "BER Exceeded Error", + "Tx Vcid Data Error", + "Replay Buffer Parity Error", + "Data Parity Error", + "Replay Fifo Overflow Error", + "Replay FIfo Underflow Error", + "Elastic Fifo Overflow Error", + "Deskew Error", + "Flow Control CRC Error", + "Data Startup Limit Error", + "FC Init Timeout Error", + "Recovery Timeout Error", + "Ready Serial Timeout Error", + "Ready Serial Attempt Error", + "Recovery Attempt Error", + "Recovery Relock Attempt Error", + "Replay Attempt Error", + "Sync Header Error", + "Tx Replay Timeout Error", + "Rx Replay Timeout Error", + "LinkSub Tx Timeout Error", + "LinkSub Rx Timeout Error", + "Rx CMD Pocket Error", +}; + +static const char * const smca_xgmiphy_mce_desc[] = { + "RAM ECC Error", + "ARC instruction buffer parity error", + "ARC data buffer parity error", + "PHY APB error", +}; + +static const char * const smca_waflphy_mce_desc[] = { + "RAM ECC Error", + "ARC instruction buffer parity error", + "ARC data buffer parity error", + "PHY APB error", +}; + struct smca_mce_desc { const char * const *descs; unsigned int num_descs; @@ -418,6 +483,7 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, + [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) }, [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, @@ -426,6 +492,10 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, + [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) }, + [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) }, + [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) }, + [SMCA_WAFL_PHY] = { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc) }, }; static bool f12h_mc0_mce(u16 ec, u8 xec) -- cgit v1.2.3 From 0a470c843d233c2f6b68ae65357a246d9fb66178 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 15 Mar 2021 15:40:00 +0800 Subject: x86/pci: Return true/false (not 1/0) from bool functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return boolean values ("true" or "false") instead of 1 or 0 from bool functions. This fixes the following warnings from coccicheck: ./arch/x86/pci/mmconfig-shared.c:464:9-10: WARNING: return of 0/1 in function 'is_mmconf_reserved' with return type bool ./arch/x86/pci/mmconfig-shared.c:493:5-6: WARNING: return of 0/1 in function 'is_mmconf_reserved' with return type bool ./arch/x86/pci/mmconfig-shared.c:501:9-10: WARNING: return of 0/1 in function 'is_mmconf_reserved' with return type bool ./arch/x86/pci/mmconfig-shared.c:522:5-6: WARNING: return of 0/1 in function 'is_mmconf_reserved' with return type bool Link: https://lore.kernel.org/r/1615794000-102771-1-git-send-email-yang.lee@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- arch/x86/pci/mmconfig-shared.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index de6bf0e7e8f8..758cbfe55daa 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -461,7 +461,7 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved, } if (size < (16UL<<20) && size != old_size) - return 0; + return false; if (dev) dev_info(dev, "MMCONFIG at %pR reserved in %s\n", @@ -493,7 +493,7 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved, &cfg->res, (unsigned long) cfg->address); } - return 1; + return true; } static bool __ref @@ -501,7 +501,7 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e { if (!early && !acpi_disabled) { if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0)) - return 1; + return true; if (dev) dev_info(dev, FW_INFO @@ -522,14 +522,14 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e * _CBA method, just assume it's reserved. */ if (pci_mmcfg_running_state) - return 1; + return true; /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ if (raw_pci_ops) return is_mmconf_reserved(e820__mapped_all, cfg, dev, 1); - return 0; + return false; } static void __init pci_mmcfg_reject_broken(int early) -- cgit v1.2.3 From 40cd0aae5957ec175b73dc17dce6079d33fa74f6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 27 May 2021 15:28:46 -0700 Subject: x86/mce: Include a MCi_MISC value in faked mce logs When BIOS reports memory errors to Linux using the ACPI/APEI error reporting method Linux creates a "struct mce" to pass to the normal reporting code path. The constructed record doesn't include a value for the "misc" field of the structure, and so mce_usable_address() says this record doesn't include a valid address. Net result is that functions like uc_decode_notifier() will just ignore this record instead of taking action to offline a page. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210527222846.931851-1-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/apei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index b58b85380ddb..0e3ae64d3b76 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -36,7 +36,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) mce_setup(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT; if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; -- cgit v1.2.3 From e87e46d5f3182f82d997641d95db01a7feacef92 Mon Sep 17 00:00:00 2001 From: Yuan Yao Date: Wed, 26 May 2021 14:38:28 +0800 Subject: KVM: X86: Use kvm_get_linear_rip() in single-step and #DB/#BP interception The kvm_get_linear_rip() handles x86/long mode cases well and has better readability, __kvm_set_rflags() also use the paired function kvm_is_linear_rip() to check the vcpu->arch.singlestep_rip set in kvm_arch_vcpu_ioctl_set_guest_debug(), so change the "CS.BASE + RIP" code in kvm_arch_vcpu_ioctl_set_guest_debug() and handle_exception_nmi() to this one. Signed-off-by: Yuan Yao Message-Id: <20210526063828.1173-1-yuan.yao@linux.intel.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 5 ++--- arch/x86/kvm/x86.c | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 639ec3eba9b8..50b42d7a8a11 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4843,7 +4843,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; - unsigned long cr2, rip, dr6; + unsigned long cr2, dr6; u32 vect_info; vect_info = vmx->idt_vectoring_info; @@ -4933,8 +4933,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; - rip = kvm_rip_read(vcpu); - kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe464b66898f..2d725567961f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10120,8 +10120,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, kvm_update_dr7(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); + vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu); /* * Trigger an rflags update that will inject or remove the trace -- cgit v1.2.3 From da6393cdd8aaa354b3a2437cd73ebb34cac958e3 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 27 May 2021 17:01:36 -0700 Subject: KVM: X86: Fix warning caused by stale emulation context Reported by syzkaller: WARNING: CPU: 7 PID: 10526 at linux/arch/x86/kvm//x86.c:7621 x86_emulate_instruction+0x41b/0x510 [kvm] RIP: 0010:x86_emulate_instruction+0x41b/0x510 [kvm] Call Trace: kvm_mmu_page_fault+0x126/0x8f0 [kvm] vmx_handle_exit+0x11e/0x680 [kvm_intel] vcpu_enter_guest+0xd95/0x1b40 [kvm] kvm_arch_vcpu_ioctl_run+0x377/0x6a0 [kvm] kvm_vcpu_ioctl+0x389/0x630 [kvm] __x64_sys_ioctl+0x8e/0xd0 do_syscall_64+0x3c/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Commit 4a1e10d5b5d8 ("KVM: x86: handle hardware breakpoints during emulation()) adds hardware breakpoints check before emulation the instruction and parts of emulation context initialization, actually we don't have the EMULTYPE_NO_DECODE flag here and the emulation context will not be reused. Commit c8848cee74ff ("KVM: x86: set ctxt->have_exception in x86_decode_insn()) triggers the warning because it catches the stale emulation context has #UD, however, it is not during instruction decoding which should result in EMULATION_FAILED. This patch fixes it by moving the second part emulation context initialization into init_emulate_ctxt() and before hardware breakpoints check. The ctxt->ud will be dropped by a follow-up patch. syzkaller source: https://syzkaller.appspot.com/x/repro.c?x=134683fdd00000 Reported-by: syzbot+71271244f206d17f6441@syzkaller.appspotmail.com Fixes: 4a1e10d5b5d8 (KVM: x86: handle hardware breakpoints during emulation) Signed-off-by: Wanpeng Li Reviewed-by: Sean Christopherson Message-Id: <1622160097-37633-1-git-send-email-wanpengli@tencent.com> --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d725567961f..622cba2ed699 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7228,6 +7228,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->exception.vector = -1; + ctxt->perm_ok = false; + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } @@ -7563,11 +7568,6 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, kvm_vcpu_check_breakpoint(vcpu, &r)) return r; - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->exception.vector = -1; - ctxt->perm_ok = false; - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; r = x86_decode_insn(ctxt, insn, insn_len); -- cgit v1.2.3 From b35491e66c87946f380ebf8ab10a7e1f795e5ece Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 27 May 2021 17:01:37 -0700 Subject: KVM: X86: Kill off ctxt->ud ctxt->ud is consumed only by x86_decode_insn(), we can kill it off by passing emulation_type to x86_decode_insn() and dropping ctxt->ud altogether. Tracking that info in ctxt for literally one call is silly. Suggested-by: Sean Christopherson Signed-off-by: Wanpeng Li Reviewed-by: Sean Christopherson Message-Id: <1622160097-37633-2-git-send-email-wanpengli@tencent.com> --- arch/x86/kvm/emulate.c | 5 +++-- arch/x86/kvm/kvm_emulate.h | 3 +-- arch/x86/kvm/x86.c | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8a0ccdb56076..5e5de05a8fbf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -5111,7 +5111,7 @@ done: return rc; } -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) { int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; @@ -5322,7 +5322,8 @@ done_prefixes: ctxt->execute = opcode.u.execute; - if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD))) + if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && + likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; if (unlikely(ctxt->d & diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index f016838faedd..3e870bf9ca4d 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -314,7 +314,6 @@ struct x86_emulate_ctxt { int interruptibility; bool perm_ok; /* do not check permissions if true */ - bool ud; /* inject an #UD if host doesn't support insn */ bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; @@ -491,7 +490,7 @@ enum x86_intercept { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type); bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 #define EMULATION_OK 0 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 622cba2ed699..1cd6d4685932 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7568,9 +7568,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, kvm_vcpu_check_breakpoint(vcpu, &r)) return r; - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); + r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; -- cgit v1.2.3 From 7d65f9e80646c595e8c853640a9d0768a33e204c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 May 2021 13:08:41 +0200 Subject: x86/apic: Mark _all_ legacy interrupts when IO/APIC is missing PIC interrupts do not support affinity setting and they can end up on any online CPU. Therefore, it's required to mark the associated vectors as system-wide reserved. Otherwise, the corresponding irq descriptors are copied to the secondary CPUs but the vectors are not marked as assigned or reserved. This works correctly for the IO/APIC case. When the IO/APIC is disabled via config, kernel command line or lack of enumeration then all legacy interrupts are routed through the PIC, but nothing marks them as system-wide reserved vectors. As a consequence, a subsequent allocation on a secondary CPU can result in allocating one of these vectors, which triggers the BUG() in apic_update_vector() because the interrupt descriptor slot is not empty. Imran tried to work around that by marking those interrupts as allocated when a CPU comes online. But that's wrong in case that the IO/APIC is available and one of the legacy interrupts, e.g. IRQ0, has been switched to PIC mode because then marking them as allocated will fail as they are already marked as system vectors. Stay consistent and update the legacy vectors after attempting IO/APIC initialization and mark them as system vectors in case that no IO/APIC is available. Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") Reported-by: Imran Khan Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210519233928.2157496-1-imran.f.khan@oracle.com --- arch/x86/include/asm/apic.h | 1 + arch/x86/kernel/apic/apic.c | 1 + arch/x86/kernel/apic/vector.c | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 412b51e059c8..48067af94678 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -174,6 +174,7 @@ static inline int apic_is_clustered_box(void) extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); extern void lapic_assign_system_vectors(void); extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); +extern void lapic_update_legacy_vectors(void); extern void lapic_online(void); extern void lapic_offline(void); extern bool apic_needs_pit(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4a39fb429f15..d262811ce14b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2604,6 +2604,7 @@ static void __init apic_bsp_setup(bool upmode) end_local_APIC_setup(); irq_remap_enable_fault_handling(); setup_IO_APIC(); + lapic_update_legacy_vectors(); } #ifdef CONFIG_UP_LATE_INIT diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 6dbdc7c22bb7..fb67ed5e7e6a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -738,6 +738,26 @@ void lapic_assign_legacy_vector(unsigned int irq, bool replace) irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); } +void __init lapic_update_legacy_vectors(void) +{ + unsigned int i; + + if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0) + return; + + /* + * If the IO/APIC is disabled via config, kernel command line or + * lack of enumeration then all legacy interrupts are routed + * through the PIC. Make sure that they are marked as legacy + * vectors. PIC_CASCADE_IRQ has already been marked in + * lapic_assign_system_vectors(). + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + lapic_assign_legacy_vector(i, true); + } +} + void __init lapic_assign_system_vectors(void) { unsigned int i, vector = 0; -- cgit v1.2.3 From 4a0e3ff30980b7601b13dd3b7ee275212b852843 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 26 May 2021 06:58:47 -0700 Subject: perf/x86/intel/uncore: Fix a kernel WARNING triggered by maxcpus=1 A kernel WARNING may be triggered when setting maxcpus=1. The uncore counters are Die-scope. When probing a PCI device, only the BUS information can be retrieved. The uncore driver has to maintain a mapping table used to calculate the logical Die ID from a given BUS#. Before the patch ba9506be4e40, the mapping table stores the mapping information from the BUS# -> a Physical Socket ID. To calculate the logical die ID, perf does, - In snbep_pci2phy_map_init(), retrieve the BUS# -> a Physical Socket ID from the UBOX PCI configure space. - Calculate the mapping information (a BUS# -> a Physical Socket ID) for the other PCI BUS. - In the uncore_pci_probe(), get the physical Socket ID from a given BUS and the mapping table. - Calculate the logical Die ID Since only the logical Die ID is required, with the patch ba9506be4e40, the mapping table stores the mapping information from the BUS# -> a logical Die ID. Now perf does, - In snbep_pci2phy_map_init(), retrieve the BUS# -> a Physical Socket ID from the UBOX PCI configure space. - Calculate the logical Die ID - Calculate the mapping information (a BUS# -> a logical Die ID) for the other PCI BUS. - In the uncore_pci_probe(), get the logical die ID from a given BUS and the mapping table. When calculating the logical Die ID, -1 may be returned, especially when maxcpus=1. Here, -1 means the logical Die ID is not found. But when calculating the mapping information for the other PCI BUS, -1 indicates that it's the other PCI BUS that requires the calculation of the mapping. The driver will mistakenly do the calculation. Uses the -ENODEV to indicate the case which the logical Die ID is not found. The driver will not mess up the mapping table anymore. Fixes: ba9506be4e40 ("perf/x86/intel/uncore: Store the logical die id instead of the physical die id.") Reported-by: John Donnelly Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Donnelly Tested-by: John Donnelly Link: https://lkml.kernel.org/r/1622037527-156028-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 63f097289a84..1587d3289743 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1406,6 +1406,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool die_id = i; else die_id = topology_phys_to_logical_pkg(i); + if (die_id < 0) + die_id = -ENODEV; map->pbus_to_dieid[bus] = die_id; break; } @@ -1452,14 +1454,14 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool i = -1; if (reverse) { for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_dieid[bus] >= 0) + if (map->pbus_to_dieid[bus] != -1) i = map->pbus_to_dieid[bus]; else map->pbus_to_dieid[bus] = i; } } else { for (bus = 0; bus <= 255; bus++) { - if (map->pbus_to_dieid[bus] >= 0) + if (map->pbus_to_dieid[bus] != -1) i = map->pbus_to_dieid[bus]; else map->pbus_to_dieid[bus] = i; -- cgit v1.2.3 From 280b68a3b3b96b027fcdeb5a3916a8e2aaf84d03 Mon Sep 17 00:00:00 2001 From: Pu Wen Date: Fri, 28 May 2021 16:14:17 +0800 Subject: x86/cstate: Allow ACPI C1 FFH MWAIT use on Hygon systems Hygon systems support the MONITOR/MWAIT instructions and these can be used for ACPI C1 in the same way as on AMD and Intel systems. The BIOS declares a C1 state in _CST to use FFH and CPUID_Fn00000005_EDX is non-zero on Hygon systems. Allow ffh_cstate_init() to succeed on Hygon systems to default using FFH MWAIT instead of HALT for ACPI C1. Signed-off-by: Pu Wen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210528081417.31474-1-puwen@hygon.cn --- arch/x86/kernel/acpi/cstate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 49ae4e1ac9cd..7de599eba7f0 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -197,7 +197,8 @@ static int __init ffh_cstate_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_INTEL && - c->x86_vendor != X86_VENDOR_AMD) + c->x86_vendor != X86_VENDOR_AMD && + c->x86_vendor != X86_VENDOR_HYGON) return -1; cpu_cstate_entry = alloc_percpu(struct cstate_entry); -- cgit v1.2.3 From 9a90ed065a155d13db0d0ffeaad5cc54e51c90c6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 May 2021 11:02:26 +0200 Subject: x86/thermal: Fix LVT thermal setup for SMI delivery mode There are machines out there with added value crap^WBIOS which provide an SMI handler for the local APIC thermal sensor interrupt. Out of reset, the BSP on those machines has something like 0x200 in that APIC register (timestamps left in because this whole issue is timing sensitive): [ 0.033858] read lvtthmr: 0x330, val: 0x200 which means: - bit 16 - the interrupt mask bit is clear and thus that interrupt is enabled - bits [10:8] have 010b which means SMI delivery mode. Now, later during boot, when the kernel programs the local APIC, it soft-disables it temporarily through the spurious vector register: setup_local_APIC: ... /* * If this comes from kexec/kcrash the APIC might be enabled in * SPIV. Soft disable it before doing further initialization. */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; apic_write(APIC_SPIV, value); which means (from the SDM): "10.4.7.2 Local APIC State After It Has Been Software Disabled ... * The mask bits for all the LVT entries are set. Attempts to reset these bits will be ignored." And this happens too: [ 0.124111] APIC: Switch to symmetric I/O mode setup [ 0.124117] lvtthmr 0x200 before write 0xf to APIC 0xf0 [ 0.124118] lvtthmr 0x10200 after write 0xf to APIC 0xf0 This results in CPU 0 soft lockups depending on the placement in time when the APIC soft-disable happens. Those soft lockups are not 100% reproducible and the reason for that can only be speculated as no one tells you what SMM does. Likely, it confuses the SMM code that the APIC is disabled and the thermal interrupt doesn't doesn't fire at all, leading to CPU 0 stuck in SMM forever... Now, before 4f432e8bb15b ("x86/mce: Get rid of mcheck_intel_therm_init()") due to how the APIC_LVTTHMR was read before APIC initialization in mcheck_intel_therm_init(), it would read the value with the mask bit 16 clear and then intel_init_thermal() would replicate it onto the APs and all would be peachy - the thermal interrupt would remain enabled. But that commit moved that reading to a later moment in intel_init_thermal(), resulting in reading APIC_LVTTHMR on the BSP too late and with its interrupt mask bit set. Thus, revert back to the old behavior of reading the thermal LVT register before the APIC gets initialized. Fixes: 4f432e8bb15b ("x86/mce: Get rid of mcheck_intel_therm_init()") Reported-by: James Feeney Signed-off-by: Borislav Petkov Cc: Cc: Zhang Rui Cc: Srinivas Pandruvada Link: https://lkml.kernel.org/r/YKIqDdFNaXYd39wz@zn.tnic --- arch/x86/include/asm/thermal.h | 4 +++- arch/x86/kernel/setup.c | 9 +++++++++ drivers/thermal/intel/therm_throt.c | 15 +++++++++++---- 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h index ddbdefd5b94f..91a7b6687c3b 100644 --- a/arch/x86/include/asm/thermal.h +++ b/arch/x86/include/asm/thermal.h @@ -3,11 +3,13 @@ #define _ASM_X86_THERMAL_H #ifdef CONFIG_X86_THERMAL_VECTOR +void therm_lvt_init(void); void intel_init_thermal(struct cpuinfo_x86 *c); bool x86_thermal_enabled(void); void intel_thermal_interrupt(void); #else -static inline void intel_init_thermal(struct cpuinfo_x86 *c) { } +static inline void therm_lvt_init(void) { } +static inline void intel_init_thermal(struct cpuinfo_x86 *c) { } #endif #endif /* _ASM_X86_THERMAL_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 72920af0b3c0..ff653d608d5f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -1226,6 +1227,14 @@ void __init setup_arch(char **cmdline_p) x86_init.timers.wallclock_init(); + /* + * This needs to run before setup_local_APIC() which soft-disables the + * local APIC temporarily and that masks the thermal LVT interrupt, + * leading to softlockups on machines which have configured SMI + * interrupt delivery. + */ + therm_lvt_init(); + mcheck_init(); register_refined_jiffies(CLOCK_TICK_RATE); diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index f8e882592ba5..99abdc03c44c 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -621,6 +621,17 @@ bool x86_thermal_enabled(void) return atomic_read(&therm_throt_en); } +void __init therm_lvt_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (intel_thermal_supported(&boot_cpu_data)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -630,10 +641,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if (!intel_thermal_supported(c)) return; - /* On the BSP? */ - if (c == &boot_cpu_data) - lvtthmr_init = apic_read(APIC_LVTTHMR); - /* * First check if its enabled already, in which case there might * be some SMM goo which handles it, so we can't even put a handler -- cgit v1.2.3 From 848ff3768684701a4ce73a2ec0e5d438d4e2b0da Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 1 Jun 2021 06:09:03 -0700 Subject: perf/x86/intel/uncore: Fix M2M event umask for Ice Lake server Perf tool errors out with the latest event list for the Ice Lake server. event syntax error: 'unc_m2m_imc_reads.to_pmm' \___ value too big for format, maximum is 255 The same as the Snow Ridge server, the M2M uncore unit in the Ice Lake server has the unit mask extension field as well. Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Reported-by: Jin Yao Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1622552943-119174-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 1587d3289743..3a75a2c601c2 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5099,9 +5099,10 @@ static struct intel_uncore_type icx_uncore_m2m = { .perf_ctr = SNR_M2M_PCI_PMON_CTR0, .event_ctl = SNR_M2M_PCI_PMON_CTL0, .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT, .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL, .ops = &snr_m2m_uncore_pci_ops, - .format_group = &skx_uncore_format_group, + .format_group = &snr_m2m_uncore_format_group, }; static struct attribute *icx_upi_uncore_formats_attr[] = { -- cgit v1.2.3 From cbcddaa33d7e11a053cb80a4a635c023b4f8b906 Mon Sep 17 00:00:00 2001 From: Andrew Cooper Date: Fri, 14 May 2021 14:59:20 +0100 Subject: perf/x86/rapl: Use CPUID bit on AMD and Hygon parts AMD and Hygon CPUs have a CPUID bit for RAPL. Drop the fam17h suffix as it is stale already. Make use of this instead of a model check to work more nicely in virtual environments where RAPL typically isn't available. [ bp: drop the ../cpu/powerflags.c hunk which is superfluous as the "rapl" bit name appears already in flags. ] Signed-off-by: Andrew Cooper Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210514135920.16093-1-andrew.cooper3@citrix.com --- arch/x86/events/rapl.c | 6 ++---- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/amd.c | 4 ++++ arch/x86/kernel/cpu/hygon.c | 4 ++++ 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 84a1042c3b01..85feafacc445 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -764,13 +764,14 @@ static struct rapl_model model_spr = { .rapl_msrs = intel_rapl_spr_msrs, }; -static struct rapl_model model_amd_fam17h = { +static struct rapl_model model_amd_hygon = { .events = BIT(PERF_RAPL_PKG), .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, .rapl_msrs = amd_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { + X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon), X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb), X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep), X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb), @@ -803,9 +804,6 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), - X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), - X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h), - X86_MATCH_VENDOR_FAM(AMD, 0x19, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ac37830ae941..81269c73a0dc 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -108,7 +108,7 @@ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ -/* free ( 3*32+29) */ +#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2d11384dc9ab..da57b96fafbe 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -646,6 +646,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & BIT(12)) set_cpu_cap(c, X86_FEATURE_ACC_POWER); + /* Bit 14 indicates the Runtime Average Power Limit interface. */ + if (c->x86_power & BIT(14)) + set_cpu_cap(c, X86_FEATURE_RAPL); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #else diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 0bd6c74e3ba1..6d50136f7ab9 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -260,6 +260,10 @@ static void early_init_hygon(struct cpuinfo_x86 *c) if (c->x86_power & BIT(12)) set_cpu_cap(c, X86_FEATURE_ACC_POWER); + /* Bit 14 indicates the Runtime Average Power Limit interface. */ + if (c->x86_power & BIT(14)) + set_cpu_cap(c, X86_FEATURE_RAPL); + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); #endif -- cgit v1.2.3 From 450605c28d571eddca39a65fdbc1338add44c6d9 Mon Sep 17 00:00:00 2001 From: Praveen Kumar Date: Mon, 31 May 2021 13:10:46 +0530 Subject: x86/hyperv: fix logical processor creation Microsoft Hypervisor expects the logical processor index to be the same as CPU's index during logical processor creation. Using cpu_physical_id confuses hypervisor's scheduler. That causes the root partition not boot when core scheduler is used. This patch removes the call to cpu_physical_id and uses the CPU index directly for bringing up logical processor. This scheme works for both classic scheduler and core scheduler. Fixes: 333abaf5abb3 (x86/hyperv: implement and use hv_smp_prepare_cpus) Signed-off-by: Praveen Kumar Link: https://lore.kernel.org/r/20210531074046.113452-1-kumarpraveen@linux.microsoft.com Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 22f13343b5da..4fa0a4280895 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -236,7 +236,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) for_each_present_cpu(i) { if (i == 0) continue; - ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i)); + ret = hv_call_add_logical_proc(numa_cpu_node(i), i, i); BUG_ON(ret); } -- cgit v1.2.3 From 7ee0e638a526b2d1f09c714f86d82dfd7628f322 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 1 Jun 2021 21:30:56 +0200 Subject: x86/alternative: Align insn bytes vertically For easier inspection which bytes have changed. For example: feat: 7*32+12, old: (__x86_indirect_thunk_r10+0x0/0x20 (ffffffff81c02480) len: 17), repl: (ffffffff897813aa, len: 17) ffffffff81c02480: old_insn: 41 ff e2 90 90 90 90 90 90 90 90 90 90 90 90 90 90 ffffffff897813aa: rpl_insn: e8 07 00 00 00 f3 90 0f ae e8 eb f9 4c 89 14 24 c3 ffffffff81c02480: final_insn: e8 07 00 00 00 f3 90 0f ae e8 eb f9 4c 89 14 24 c3 No functional changes. Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210601193713.16190-1-bp@alien8.de --- arch/x86/kernel/alternative.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 75c752b0628c..227c4a8b145a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -273,8 +273,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; -- cgit v1.2.3 From 9bfecd05833918526cc7357d55e393393440c5fa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 May 2021 11:17:30 +0200 Subject: x86/cpufeatures: Force disable X86_FEATURE_ENQCMD and remove update_pasid() While digesting the XSAVE-related horrors which got introduced with the supervisor/user split, the recent addition of ENQCMD-related functionality got on the radar and turned out to be similarly broken. update_pasid(), which is only required when X86_FEATURE_ENQCMD is available, is invoked from two places: 1) From switch_to() for the incoming task 2) Via a SMP function call from the IOMMU/SMV code #1 is half-ways correct as it hacks around the brokenness of get_xsave_addr() by enforcing the state to be 'present', but all the conditionals in that code are completely pointless for that. Also the invocation is just useless overhead because at that point it's guaranteed that TIF_NEED_FPU_LOAD is set on the incoming task and all of this can be handled at return to user space. #2 is broken beyond repair. The comment in the code claims that it is safe to invoke this in an IPI, but that's just wishful thinking. FPU state of a running task is protected by fregs_lock() which is nothing else than a local_bh_disable(). As BH-disabled regions run usually with interrupts enabled the IPI can hit a code section which modifies FPU state and there is absolutely no guarantee that any of the assumptions which are made for the IPI case is true. Also the IPI is sent to all CPUs in mm_cpumask(mm), but the IPI is invoked with a NULL pointer argument, so it can hit a completely unrelated task and unconditionally force an update for nothing. Worse, it can hit a kernel thread which operates on a user space address space and set a random PASID for it. The offending commit does not cleanly revert, but it's sufficient to force disable X86_FEATURE_ENQCMD and to remove the broken update_pasid() code to make this dysfunctional all over the place. Anything more complex would require more surgery and none of the related functions outside of the x86 core code are blatantly wrong, so removing those would be overkill. As nothing enables the PASID bit in the IA32_XSS MSR yet, which is required to make this actually work, this cannot result in a regression except for related out of tree train-wrecks, but they are broken already today. Fixes: 20f0afd1fb3d ("x86/mmu: Allocate/free a PASID") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87mtsd6gr9.ffs@nanos.tec.linutronix.de --- arch/x86/include/asm/disabled-features.h | 7 ++-- arch/x86/include/asm/fpu/api.h | 6 +--- arch/x86/include/asm/fpu/internal.h | 7 ---- arch/x86/kernel/fpu/xstate.c | 57 -------------------------------- 4 files changed, 3 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index b7dd944dc867..8f28fafa98b3 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -56,11 +56,8 @@ # define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) #endif -#ifdef CONFIG_IOMMU_SUPPORT -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif +/* Force disable because it's broken beyond repair */ +#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) #ifdef CONFIG_X86_SGX # define DISABLE_SGX 0 diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index ed33a14188f6..23bef08a8388 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -106,10 +106,6 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); */ #define PASID_DISABLED 0 -#ifdef CONFIG_IOMMU_SUPPORT -/* Update current's PASID MSR/state by mm's PASID. */ -void update_pasid(void); -#else static inline void update_pasid(void) { } -#endif + #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 8d33ad80704f..ceeba9f63172 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -584,13 +584,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) pkru_val = pk->pkru; } __write_pkru(pkru_val); - - /* - * Expensive PASID MSR write will be avoided in update_pasid() because - * TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated - * unless it's different from mm->pasid to reduce overhead. - */ - update_pasid(); } #endif /* _ASM_X86_FPU_INTERNAL_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a85c64000218..d0eef963aad1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1402,60 +1402,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ - -#ifdef CONFIG_IOMMU_SUPPORT -void update_pasid(void) -{ - u64 pasid_state; - u32 pasid; - - if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) - return; - - if (!current->mm) - return; - - pasid = READ_ONCE(current->mm->pasid); - /* Set the valid bit in the PASID MSR/state only for valid pasid. */ - pasid_state = pasid == PASID_DISABLED ? - pasid : pasid | MSR_IA32_PASID_VALID; - - /* - * No need to hold fregs_lock() since the task's fpstate won't - * be changed by others (e.g. ptrace) while the task is being - * switched to or is in IPI. - */ - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - /* The MSR is active and can be directly updated. */ - wrmsrl(MSR_IA32_PASID, pasid_state); - } else { - struct fpu *fpu = ¤t->thread.fpu; - struct ia32_pasid_state *ppasid_state; - struct xregs_state *xsave; - - /* - * The CPU's xstate registers are not currently active. Just - * update the PASID state in the memory buffer here. The - * PASID MSR will be loaded when returning to user mode. - */ - xsave = &fpu->state.xsave; - xsave->header.xfeatures |= XFEATURE_MASK_PASID; - ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID); - /* - * Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state - * won't be NULL and no need to check its value. - * - * Only update the task's PASID state when it's different - * from the mm's pasid. - */ - if (ppasid_state->pasid != pasid_state) { - /* - * Invalid fpregs so that state restoring will pick up - * the PASID state. - */ - __fpu_invalidate_fpregs_state(fpu); - ppasid_state->pasid = pasid_state; - } - } -} -#endif /* CONFIG_IOMMU_SUPPORT */ -- cgit v1.2.3 From 2b31e8ed96b260ce2c22bd62ecbb9458399e3b62 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 1 Jun 2021 17:51:22 +0200 Subject: x86/alternative: Optimize single-byte NOPs at an arbitrary position Up until now the assumption was that an alternative patching site would have some instructions at the beginning and trailing single-byte NOPs (0x90) padding. Therefore, the patching machinery would go and optimize those single-byte NOPs into longer ones. However, this assumption is broken on 32-bit when code like hv_do_hypercall() in hyperv_init() would use the ratpoline speculation killer CALL_NOSPEC. The 32-bit version of that macro would align certain insns to 16 bytes, leading to the compiler issuing a one or more single-byte NOPs, depending on the holes it needs to fill for alignment. That would lead to the warning in optimize_nops() to fire: ------------[ cut here ]------------ Not a NOP at 0xc27fb598 WARNING: CPU: 0 PID: 0 at arch/x86/kernel/alternative.c:211 optimize_nops.isra.13 due to that function verifying whether all of the following bytes really are single-byte NOPs. Therefore, carve out the NOP padding into a separate function and call it for each NOP range beginning with a single-byte NOP. Fixes: 23c1ad538f4f ("x86/alternatives: Optimize optimize_nops()") Reported-by: Richard Narron Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://bugzilla.kernel.org/show_bug.cgi?id=213301 Link: https://lkml.kernel.org/r/20210601212125.17145-1-bp@alien8.de --- arch/x86/kernel/alternative.c | 64 +++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6974b5174495..6fe5b44fcbc9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -182,42 +182,70 @@ done: n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); } +/* + * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) + * + * @instr: instruction byte stream + * @instrlen: length of the above + * @off: offset within @instr where the first NOP has been detected + * + * Return: number of NOPs found (and replaced). + */ +static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) +{ + unsigned long flags; + int i = off, nnops; + + while (i < instrlen) { + if (instr[i] != 0x90) + break; + + i++; + } + + nnops = i - off; + + if (nnops <= 1) + return nnops; + + local_irq_save(flags); + add_nops(instr + off, nnops); + local_irq_restore(flags); + + DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + + return nnops; +} + /* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { - unsigned long flags; struct insn insn; - int nop, i = 0; + int i = 0; /* - * Jump over the non-NOP insns, the remaining bytes must be single-byte - * NOPs, optimize them. + * Jump over the non-NOP insns and optimize single-byte NOPs into bigger + * ones. */ for (;;) { if (insn_decode_kernel(&insn, &instr[i])) return; + /* + * See if this and any potentially following NOPs can be + * optimized. + */ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - break; - - if ((i += insn.length) >= a->instrlen) - return; - } + i += optimize_nops_range(instr, a->instrlen, i); + else + i += insn.length; - for (nop = i; i < a->instrlen; i++) { - if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i])) + if (i >= a->instrlen) return; } - - local_irq_save(flags); - add_nops(instr + nop, i - nop); - local_irq_restore(flags); - - DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ", - instr, nop, a->instrlen); } /* -- cgit v1.2.3 From f1d4d47c5851b348b7713007e152bc68b94d728b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 1 Jun 2021 10:53:52 +0300 Subject: x86/setup: Always reserve the first 1M of RAM There are BIOSes that are known to corrupt the memory under 1M, or more precisely under 640K because the memory above 640K is anyway reserved for the EGA/VGA frame buffer and BIOS. To prevent usage of the memory that will be potentially clobbered by the kernel, the beginning of the memory is always reserved. The exact size of the reserved area is determined by CONFIG_X86_RESERVE_LOW build time and the "reservelow=" command line option. The reserved range may be from 4K to 640K with the default of 64K. There are also configurations that reserve the entire 1M range, like machines with SandyBridge graphic devices or systems that enable crash kernel. In addition to the potentially clobbered memory, EBDA of unknown size may be as low as 128K and the memory above that EBDA start is also reserved early. It would have been possible to reserve the entire range under 1M unless for the real mode trampoline that must reside in that area. To accommodate placement of the real mode trampoline and keep the memory safe from being clobbered by BIOS, reserve the first 64K of RAM before memory allocations are possible and then, after the real mode trampoline is allocated, reserve the entire range from 0 to 1M. Update trim_snb_memory() and reserve_real_mode() to avoid redundant reservations of the same memory range. Also make sure the memory under 1M is not getting freed by efi_free_boot_services(). [ bp: Massage commit message and comments. ] Fixes: a799c2bd29d1 ("x86/setup: Consolidate early memory reservations") Signed-off-by: Mike Rapoport Signed-off-by: Borislav Petkov Tested-by: Hugh Dickins Link: https://bugzilla.kernel.org/show_bug.cgi?id=213177 Link: https://lkml.kernel.org/r/20210601075354.5149-2-rppt@kernel.org --- arch/x86/kernel/setup.c | 35 +++++++++++++++++++++-------------- arch/x86/platform/efi/quirks.c | 12 ++++++++++++ arch/x86/realmode/init.c | 14 ++++++++------ 3 files changed, 41 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ff653d608d5f..1e720626069a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -638,11 +638,11 @@ static void __init trim_snb_memory(void) * them from accessing certain memory ranges, namely anything below * 1M and in the pages listed in bad_pages[] above. * - * To avoid these pages being ever accessed by SNB gfx devices - * reserve all memory below the 1 MB mark and bad_pages that have - * not already been reserved at boot time. + * To avoid these pages being ever accessed by SNB gfx devices reserve + * bad_pages that have not already been reserved at boot time. + * All memory below the 1 MB mark is anyway reserved later during + * setup_arch(), so there is no need to reserve it here. */ - memblock_reserve(0, 1<<20); for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { if (memblock_reserve(bad_pages[i], PAGE_SIZE)) @@ -734,14 +734,14 @@ static void __init early_reserve_memory(void) * The first 4Kb of memory is a BIOS owned area, but generally it is * not listed as such in the E820 table. * - * Reserve the first memory page and typically some additional - * memory (64KiB by default) since some BIOSes are known to corrupt - * low memory. See the Kconfig help text for X86_RESERVE_LOW. + * Reserve the first 64K of memory since some BIOSes are known to + * corrupt low memory. After the real mode trampoline is allocated the + * rest of the memory below 640k is reserved. * * In addition, make sure page 0 is always reserved because on * systems with L1TF its contents can be leaked to user processes. */ - memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); + memblock_reserve(0, SZ_64K); early_reserve_initrd(); @@ -752,6 +752,7 @@ static void __init early_reserve_memory(void) reserve_ibft_region(); reserve_bios_regions(); + trim_snb_memory(); } /* @@ -1082,14 +1083,20 @@ void __init setup_arch(char **cmdline_p) (max_pfn_mapped< Date: Tue, 1 Jun 2021 16:52:03 +0800 Subject: x86/fault: Don't send SIGSEGV twice on SEGV_PKUERR __bad_area_nosemaphore() calls both force_sig_pkuerr() and force_sig_fault() when handling SEGV_PKUERR. This does not cause problems because the second signal is filtered by the legacy_queue() check in __send_signal() because in both cases, the signal is SIGSEGV, the second one seeing that the first one is already pending. This causes the kernel to do unnecessary work so send the signal only once for SEGV_PKUERR. [ bp: Massage commit message. ] Fixes: 9db812dbb29d ("signal/x86: Call force_sig_pkuerr from __bad_area_nosemaphore") Suggested-by: "Eric W. Biederman" Signed-off-by: Jiashuo Liang Signed-off-by: Borislav Petkov Acked-by: "Eric W. Biederman" Link: https://lkml.kernel.org/r/20210601085203.40214-1-liangjs@pku.edu.cn --- arch/x86/mm/fault.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1c548ad00752..6bda7f67d737 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -836,8 +836,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); - - force_sig_fault(SIGSEGV, si_code, (void __user *)address); + else + force_sig_fault(SIGSEGV, si_code, (void __user *)address); local_irq_disable(); } -- cgit v1.2.3 From 314a1e1eabea5b86532e90e0d4e217fa88471e3b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 3 Jun 2021 16:08:10 -0700 Subject: x86/pkeys: Skip 'init_pkru' debugfs file creation when pkeys not supported The PKRU hardware is permissive by default: all reads and writes are allowed. The in-kernel policy is restrictive by default: deny all unnecessary access until explicitly requested. That policy can be modified with a debugfs file: "x86/init_pkru". This file is created unconditionally, regardless of PKRU support in the hardware, which is a little silly. Avoid creating the file when pkeys are not available. This also removes the need to check for pkey support at runtime, which would be required once the new pkey modification infrastructure is put in place later in this series. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210603230810.113FF3F2@viggo.jf.intel.com --- arch/x86/mm/pkeys.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index a2332eef66e9..4a67b922bce1 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -192,6 +192,10 @@ static const struct file_operations fops_init_pkru = { static int __init create_init_pkru_value(void) { + /* Do not expose the file if pkeys are not supported. */ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return 0; + debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR, arch_debugfs_dir, NULL, &fops_init_pkru); return 0; -- cgit v1.2.3 From 009767dbf42ac0dbe3cf48c1ee224f6b778aa85a Mon Sep 17 00:00:00 2001 From: Pu Wen Date: Wed, 2 Jun 2021 15:02:07 +0800 Subject: x86/sev: Check SME/SEV support in CPUID first The first two bits of the CPUID leaf 0x8000001F EAX indicate whether SEV or SME is supported, respectively. It's better to check whether SEV or SME is actually supported before accessing the MSR_AMD64_SEV to check whether SEV or SME is enabled. This is both a bare-metal issue and a guest/VM issue. Since the first generation Hygon Dhyana CPU doesn't support the MSR_AMD64_SEV, reading that MSR results in a #GP - either directly from hardware in the bare-metal case or via the hypervisor (because the RDMSR is actually intercepted) in the guest/VM case, resulting in a failed boot. And since this is very early in the boot phase, rdmsrl_safe()/native_read_msr_safe() can't be used. So check the CPUID bits first, before accessing the MSR. [ tlendacky: Expand and improve commit message. ] [ bp: Massage commit message. ] Fixes: eab696d8e8b9 ("x86/sev: Do not require Hypervisor CPUID bit for SEV guests") Signed-off-by: Pu Wen Signed-off-by: Borislav Petkov Acked-by: Tom Lendacky Cc: # v5.10+ Link: https://lkml.kernel.org/r/20210602070207.2480-1-puwen@hygon.cn --- arch/x86/mm/mem_encrypt_identity.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index a9639f663d25..470b20208430 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -504,10 +504,6 @@ void __init sme_enable(struct boot_params *bp) #define AMD_SME_BIT BIT(0) #define AMD_SEV_BIT BIT(1) - /* Check the SEV MSR whether SEV or SME is enabled */ - sev_status = __rdmsr(MSR_AMD64_SEV); - feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; - /* * Check for the SME/SEV feature: * CPUID Fn8000_001F[EAX] @@ -519,11 +515,16 @@ void __init sme_enable(struct boot_params *bp) eax = 0x8000001f; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); - if (!(eax & feature_mask)) + /* Check whether SEV or SME is supported */ + if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT))) return; me_mask = 1UL << (ebx & 0x3f); + /* Check the SEV MSR whether SEV or SME is enabled */ + sev_status = __rdmsr(MSR_AMD64_SEV); + feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; + /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { /* -- cgit v1.2.3 From 92638b4e1b47f97d7269e74465dedf73096f777d Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 2 Jun 2021 16:52:27 -0700 Subject: mm: arch: remove indirection level in alloc_zeroed_user_highpage_movable() In an upcoming change we would like to add a flag to GFP_HIGHUSER_MOVABLE so that it would no longer be an OR of GFP_HIGHUSER and __GFP_MOVABLE. This poses a problem for alloc_zeroed_user_highpage_movable() which passes __GFP_MOVABLE into an arch-specific __alloc_zeroed_user_highpage() hook which ORs in GFP_HIGHUSER. Since __alloc_zeroed_user_highpage() is only ever called from alloc_zeroed_user_highpage_movable(), we can remove one level of indirection here. Remove __alloc_zeroed_user_highpage(), make alloc_zeroed_user_highpage_movable() the hook, and use GFP_HIGHUSER_MOVABLE in the hook implementations so that they will pick up the new flag that we are going to add. Signed-off-by: Peter Collingbourne Link: https://linux-review.googlesource.com/id/Ic6361c657b2cdcd896adbe0cf7cb5a7fbb1ed7bf Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20210602235230.3928842-2-pcc@google.com Signed-off-by: Will Deacon --- arch/alpha/include/asm/page.h | 6 +++--- arch/arm64/include/asm/page.h | 6 +++--- arch/ia64/include/asm/page.h | 6 +++--- arch/m68k/include/asm/page_no.h | 6 +++--- arch/s390/include/asm/page.h | 6 +++--- arch/x86/include/asm/page.h | 6 +++--- include/linux/highmem.h | 35 ++++++++--------------------------- 7 files changed, 26 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 268f99b4602b..18f48a6f2ff6 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -17,9 +17,9 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vmaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vmaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 012cffc574e8..e1fc0f60e79f 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -28,9 +28,9 @@ void copy_user_highpage(struct page *to, struct page *from, void copy_highpage(struct page *to, struct page *from); #define __HAVE_ARCH_COPY_HIGHPAGE -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h index f4dc81fa7146..1b990466d540 100644 --- a/arch/ia64/include/asm/page.h +++ b/arch/ia64/include/asm/page.h @@ -82,16 +82,16 @@ do { \ } while (0) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ ({ \ struct page *page = alloc_page_vma( \ - GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr); \ + GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr); \ if (page) \ flush_dcache_page(page); \ page; \ }) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index 8d0f862ee9d7..c9d0d84158a4 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -13,9 +13,9 @@ extern unsigned long memory_end; #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index cc98f9b78fd4..479dc76e0eca 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -68,9 +68,9 @@ static inline void copy_page(void *to, void *from) #define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE /* * These are used to make use of C type-checking.. diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 7555b48803a8..4d5810c8fab7 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -34,9 +34,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } -#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ - alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#define alloc_zeroed_user_highpage_movable(vma, vaddr) \ + alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 832b49b50c7b..54d0643b8fcf 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -152,28 +152,24 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr) } #endif -#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE +#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE /** - * __alloc_zeroed_user_highpage - Allocate a zeroed HIGHMEM page for a VMA with caller-specified movable GFP flags - * @movableflags: The GFP flags related to the pages future ability to move like __GFP_MOVABLE + * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move * @vma: The VMA the page is to be allocated for * @vaddr: The virtual address the page will be inserted into * - * This function will allocate a page for a VMA but the caller is expected - * to specify via movableflags whether the page will be movable in the - * future or not + * This function will allocate a page for a VMA that the caller knows will + * be able to migrate in the future using move_pages() or reclaimed * * An architecture may override this function by defining - * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE and providing their own + * __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE and providing their own * implementation. */ static inline struct page * -__alloc_zeroed_user_highpage(gfp_t movableflags, - struct vm_area_struct *vma, - unsigned long vaddr) +alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, + unsigned long vaddr) { - struct page *page = alloc_page_vma(GFP_HIGHUSER | movableflags, - vma, vaddr); + struct page *page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (page) clear_user_highpage(page, vaddr); @@ -182,21 +178,6 @@ __alloc_zeroed_user_highpage(gfp_t movableflags, } #endif -/** - * alloc_zeroed_user_highpage_movable - Allocate a zeroed HIGHMEM page for a VMA that the caller knows can move - * @vma: The VMA the page is to be allocated for - * @vaddr: The virtual address the page will be inserted into - * - * This function will allocate a page for a VMA that the caller knows will - * be able to migrate in the future using move_pages() or reclaimed - */ -static inline struct page * -alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, - unsigned long vaddr) -{ - return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr); -} - static inline void clear_highpage(struct page *page) { void *kaddr = kmap_atomic(page); -- cgit v1.2.3 From a4d7e8ae4a541557d7a2c815835b786c18c3613c Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 2 Jun 2021 14:36:44 -0700 Subject: Drivers: hv: Move Hyper-V extended capability check to arch neutral code The extended capability query code is currently under arch/x86, but it is architecture neutral, and is used by arch neutral code in the Hyper-V balloon driver. Hence the balloon driver fails to build on other architectures. Fix by moving the ext cap code out from arch/x86. Because it is also called from built-in architecture specific code, it can't be in a module, so the Makefile treats as built-in even when CONFIG_HYPERV is "m". Also drivers/Makefile is tweaked because this is the first occurrence of a Hyper-V file that is built-in even when CONFIG_HYPERV is "m". While here, update the hypercall status check to use the new helper function instead of open coding. No functional change. Signed-off-by: Michael Kelley Reviewed-by: Sunil Muthuswamy Link: https://lore.kernel.org/r/1622669804-2016-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 47 --------------------------------- drivers/Makefile | 2 +- drivers/hv/Makefile | 3 +++ drivers/hv/hv_common.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 48 deletions(-) create mode 100644 drivers/hv/hv_common.c (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index bb0ae4b5c00f..6952e219cba3 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -614,50 +614,3 @@ bool hv_is_isolation_supported(void) return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; } EXPORT_SYMBOL_GPL(hv_is_isolation_supported); - -/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */ -bool hv_query_ext_cap(u64 cap_query) -{ - /* - * The address of the 'hv_extended_cap' variable will be used as an - * output parameter to the hypercall below and so it should be - * compatible with 'virt_to_phys'. Which means, it's address should be - * directly mapped. Use 'static' to keep it compatible; stack variables - * can be virtually mapped, making them imcompatible with - * 'virt_to_phys'. - * Hypercall input/output addresses should also be 8-byte aligned. - */ - static u64 hv_extended_cap __aligned(8); - static bool hv_extended_cap_queried; - u64 status; - - /* - * Querying extended capabilities is an extended hypercall. Check if the - * partition supports extended hypercall, first. - */ - if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS)) - return false; - - /* Extended capabilities do not change at runtime. */ - if (hv_extended_cap_queried) - return hv_extended_cap & cap_query; - - status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, - &hv_extended_cap); - - /* - * The query extended capabilities hypercall should not fail under - * any normal circumstances. Avoid repeatedly making the hypercall, on - * error. - */ - hv_extended_cap_queried = true; - status &= HV_HYPERCALL_RESULT_MASK; - if (status != HV_STATUS_SUCCESS) { - pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n", - status); - return false; - } - - return hv_extended_cap & cap_query; -} -EXPORT_SYMBOL_GPL(hv_query_ext_cap); diff --git a/drivers/Makefile b/drivers/Makefile index 5a6d613e868d..1c2e1acbd098 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -161,7 +161,7 @@ obj-$(CONFIG_SOUNDWIRE) += soundwire/ # Virtualization drivers obj-$(CONFIG_VIRT_DRIVERS) += virt/ -obj-$(CONFIG_HYPERV) += hv/ +obj-$(subst m,y,$(CONFIG_HYPERV)) += hv/ obj-$(CONFIG_PM_DEVFREQ) += devfreq/ obj-$(CONFIG_EXTCON) += extcon/ diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 94daf8240c95..d76df5c8c2a9 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -11,3 +11,6 @@ hv_vmbus-y := vmbus_drv.o \ channel_mgmt.o ring_buffer.o hv_trace.o hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o + +# Code that must be built-in +obj-$(subst m,y,$(CONFIG_HYPERV)) += hv_common.o diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c new file mode 100644 index 000000000000..f0053c786891 --- /dev/null +++ b/drivers/hv/hv_common.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Architecture neutral utility routines for interacting with + * Hyper-V. This file is specifically for code that must be + * built-in to the kernel image when CONFIG_HYPERV is set + * (vs. being in a module) because it is called from architecture + * specific code under arch/. + * + * Copyright (C) 2021, Microsoft, Inc. + * + * Author : Michael Kelley + */ + +#include +#include +#include +#include +#include + + +/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */ +bool hv_query_ext_cap(u64 cap_query) +{ + /* + * The address of the 'hv_extended_cap' variable will be used as an + * output parameter to the hypercall below and so it should be + * compatible with 'virt_to_phys'. Which means, it's address should be + * directly mapped. Use 'static' to keep it compatible; stack variables + * can be virtually mapped, making them imcompatible with + * 'virt_to_phys'. + * Hypercall input/output addresses should also be 8-byte aligned. + */ + static u64 hv_extended_cap __aligned(8); + static bool hv_extended_cap_queried; + u64 status; + + /* + * Querying extended capabilities is an extended hypercall. Check if the + * partition supports extended hypercall, first. + */ + if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS)) + return false; + + /* Extended capabilities do not change at runtime. */ + if (hv_extended_cap_queried) + return hv_extended_cap & cap_query; + + status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL, + &hv_extended_cap); + + /* + * The query extended capabilities hypercall should not fail under + * any normal circumstances. Avoid repeatedly making the hypercall, on + * error. + */ + hv_extended_cap_queried = true; + if (!hv_result_success(status)) { + pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n", + status); + return false; + } + + return hv_extended_cap & cap_query; +} +EXPORT_SYMBOL_GPL(hv_query_ext_cap); -- cgit v1.2.3 From 1a6a9044b96729abacede172d7591c714a5b81d1 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 1 Jun 2021 10:53:53 +0300 Subject: x86/setup: Remove CONFIG_X86_RESERVE_LOW and reservelow= options The CONFIG_X86_RESERVE_LOW build time and reservelow= command line option allowed to control the amount of memory under 1M that would be reserved at boot to avoid using memory that can be potentially clobbered by BIOS. Since the entire range under 1M is always reserved there is no need for these options anymore and they can be removed. Signed-off-by: Mike Rapoport Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210601075354.5149-3-rppt@kernel.org --- Documentation/admin-guide/kernel-parameters.txt | 5 ----- arch/x86/Kconfig | 29 ------------------------- arch/x86/kernel/setup.c | 24 -------------------- 3 files changed, 58 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cb89dbdedc46..d7d813032c51 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4775,11 +4775,6 @@ Reserves a hole at the top of the kernel virtual address space. - reservelow= [X86] - Format: nn[K] - Set the amount of memory to reserve for BIOS at - the bottom of the address space. - reset_devices [KNL] Force drivers to reset the underlying device during initialization. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0045e1b44190..86dae426798b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1693,35 +1693,6 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. -config X86_RESERVE_LOW - int "Amount of low memory, in kilobytes, to reserve for the BIOS" - default 64 - range 4 640 - help - Specify the amount of low memory to reserve for the BIOS. - - The first page contains BIOS data structures that the kernel - must not use, so that page must always be reserved. - - By default we reserve the first 64K of physical RAM, as a - number of BIOSes are known to corrupt that memory range - during events such as suspend/resume or monitor cable - insertion, so it must not be used by the kernel. - - You can set this to 4 if you are absolutely sure that you - trust the BIOS to get all its memory reservations and usages - right. If you know your BIOS have problems beyond the - default 64K area, you can set this to 640 to avoid using the - entire low memory range. - - If you have doubts about the BIOS (e.g. suspend/resume does - not work or there's kernel crashes after certain hardware - hotplug events) then you might want to enable - X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check - typical corruption patterns. - - Leave this to the default value of 64 if you are unsure. - config MATH_EMULATION bool depends on MODIFY_LDT_SYSCALL diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1e720626069a..7638ac6c3d80 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -695,30 +695,6 @@ static void __init e820_add_kernel_range(void) e820__range_add(start, size, E820_TYPE_RAM); } -static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; - -static int __init parse_reservelow(char *p) -{ - unsigned long long size; - - if (!p) - return -EINVAL; - - size = memparse(p, &p); - - if (size < 4096) - size = 4096; - - if (size > 640*1024) - size = 640*1024; - - reserve_low = size; - - return 0; -} - -early_param("reservelow", parse_reservelow); - static void __init early_reserve_memory(void) { /* -- cgit v1.2.3 From 65ffb3d69ed3da28af85b1e4b2aaacd6c13ba28b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 31 May 2021 18:42:58 +0200 Subject: quota: Wire up quotactl_fd syscall Wire up the quotactl_fd syscall. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- arch/alpha/kernel/syscalls/syscall.tbl | 2 +- arch/arm/tools/syscall.tbl | 2 +- arch/arm64/include/asm/unistd32.h | 3 ++- arch/ia64/kernel/syscalls/syscall.tbl | 2 +- arch/m68k/kernel/syscalls/syscall.tbl | 2 +- arch/microblaze/kernel/syscalls/syscall.tbl | 2 +- arch/mips/kernel/syscalls/syscall_n32.tbl | 2 +- arch/mips/kernel/syscalls/syscall_n64.tbl | 2 +- arch/mips/kernel/syscalls/syscall_o32.tbl | 2 +- arch/parisc/kernel/syscalls/syscall.tbl | 2 +- arch/powerpc/kernel/syscalls/syscall.tbl | 2 +- arch/s390/kernel/syscalls/syscall.tbl | 2 +- arch/sh/kernel/syscalls/syscall.tbl | 2 +- arch/sparc/kernel/syscalls/syscall.tbl | 2 +- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- arch/xtensa/kernel/syscalls/syscall.tbl | 2 +- 17 files changed, 18 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 3000a2e8ee21..a17687ed4b51 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -482,7 +482,7 @@ 550 common process_madvise sys_process_madvise 551 common epoll_pwait2 sys_epoll_pwait2 552 common mount_setattr sys_mount_setattr -# 553 reserved for quotactl_path +553 common quotactl_fd sys_quotactl_fd 554 common landlock_create_ruleset sys_landlock_create_ruleset 555 common landlock_add_rule sys_landlock_add_rule 556 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 28e03b5fec00..c5df1179fc5d 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -456,7 +456,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 5dab69d2c22b..99ffcafc736c 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -893,7 +893,8 @@ __SYSCALL(__NR_process_madvise, sys_process_madvise) __SYSCALL(__NR_epoll_pwait2, compat_sys_epoll_pwait2) #define __NR_mount_setattr 442 __SYSCALL(__NR_mount_setattr, sys_mount_setattr) -/* 443 is reserved for quotactl_path */ +#define __NR_quotactl_fd 443 +__SYSCALL(__NR_quotactl_fd, sys_quotactl_fd) #define __NR_landlock_create_ruleset 444 __SYSCALL(__NR_landlock_create_ruleset, sys_landlock_create_ruleset) #define __NR_landlock_add_rule 445 diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index bb11fe4c875a..6d07742c57b8 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -363,7 +363,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 79c2d24c89dd..541bc1b3a8f9 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -442,7 +442,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index b11395a20c20..a176faca2927 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -448,7 +448,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 9220909526f9..c2d2e19abea8 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -381,7 +381,7 @@ 440 n32 process_madvise sys_process_madvise 441 n32 epoll_pwait2 compat_sys_epoll_pwait2 442 n32 mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 n32 quotactl_fd sys_quotactl_fd 444 n32 landlock_create_ruleset sys_landlock_create_ruleset 445 n32 landlock_add_rule sys_landlock_add_rule 446 n32 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 9cd1c34f31b5..ac653d08b1ea 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -357,7 +357,7 @@ 440 n64 process_madvise sys_process_madvise 441 n64 epoll_pwait2 sys_epoll_pwait2 442 n64 mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 n64 quotactl_fd sys_quotactl_fd 444 n64 landlock_create_ruleset sys_landlock_create_ruleset 445 n64 landlock_add_rule sys_landlock_add_rule 446 n64 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index d560c467a8c6..253f2cd70b6b 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -430,7 +430,7 @@ 440 o32 process_madvise sys_process_madvise 441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 o32 mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 o32 quotactl_fd sys_quotactl_fd 444 o32 landlock_create_ruleset sys_landlock_create_ruleset 445 o32 landlock_add_rule sys_landlock_add_rule 446 o32 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index aabc37f8cae3..e26187b9ab87 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -440,7 +440,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 8f052ff4058c..aef2a290e71a 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -522,7 +522,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 0690263df1dd..64d51ab5a8b4 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -445,7 +445,7 @@ 440 common process_madvise sys_process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 0b91499ebdcf..e0a70be77d84 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -445,7 +445,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index e34cc30ef22c..603f5a821502 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -488,7 +488,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4bbc267fb36b..fba2f615119a 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -447,7 +447,7 @@ 440 i386 process_madvise sys_process_madvise 441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 442 i386 mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 i386 quotactl_fd sys_quotactl_fd 444 i386 landlock_create_ruleset sys_landlock_create_ruleset 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index ce18119ea0d0..af973e400053 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,7 +364,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index fd2f30227d96..235d67d6ceb4 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -413,7 +413,7 @@ 440 common process_madvise sys_process_madvise 441 common epoll_pwait2 sys_epoll_pwait2 442 common mount_setattr sys_mount_setattr -# 443 reserved for quotactl_path +443 common quotactl_fd sys_quotactl_fd 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self -- cgit v1.2.3 From 23721c8e92f73f9f89e7362c50c2996a5c9ad483 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 1 Jun 2021 10:53:54 +0300 Subject: x86/crash: Remove crash_reserve_low_1M() The entire memory range under 1M is unconditionally reserved in setup_arch(), so there is no need for crash_reserve_low_1M() anymore. Remove this function. Signed-off-by: Mike Rapoport Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210601075354.5149-4-rppt@kernel.org --- arch/x86/include/asm/crash.h | 6 ------ arch/x86/kernel/crash.c | 13 ------------- 2 files changed, 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h index f58de66091e5..8b6bd63530dc 100644 --- a/arch/x86/include/asm/crash.h +++ b/arch/x86/include/asm/crash.h @@ -9,10 +9,4 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params); void crash_smp_send_stop(void); -#ifdef CONFIG_KEXEC_CORE -void __init crash_reserve_low_1M(void); -#else -static inline void __init crash_reserve_low_1M(void) { } -#endif - #endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 54ce999ed321..e8326a8d1c5d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -70,19 +70,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) rcu_read_unlock(); } -/* - * When the crashkernel option is specified, only use the low - * 1M for the real mode trampoline. - */ -void __init crash_reserve_low_1M(void) -{ - if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) - return; - - memblock_reserve(0, 1<<20); - pr_info("Reserving the low 1M of memory for crashkernel\n"); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) -- cgit v1.2.3 From 31b77c70d9bc04d3b024ea56c129523f9edc1328 Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Tue, 8 Jun 2021 11:04:11 +0530 Subject: x86/gpu: add JasperLake to gen11 early quirks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's reserve JSL stolen memory for graphics. JasperLake is a gen11 platform which is compatible with ICL/EHL changes. This was missed in commit 24ea098b7c0d ("drm/i915/jsl: Split EHL/JSL platform info and PCI ids") V2: - Added maintainer list in cc - Added patch ref in commit message V1: - Added Cc: x86@kernel.org Fixes: 24ea098b7c0d ("drm/i915/jsl: Split EHL/JSL platform info and PCI ids") Cc: # v5.11+ Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: José Roberto de Souza Signed-off-by: Tejas Upadhyay Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20210608053411.394166-1-tejaskumarx.surendrakumar.upadhyay@intel.com --- arch/x86/kernel/early-quirks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index b553ffe9b985..38837dad46e6 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -549,6 +549,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_CNL_IDS(&gen9_early_ops), INTEL_ICL_11_IDS(&gen11_early_ops), INTEL_EHL_IDS(&gen11_early_ops), + INTEL_JSL_IDS(&gen11_early_ops), INTEL_TGL_12_IDS(&gen11_early_ops), INTEL_RKL_IDS(&gen11_early_ops), INTEL_ADLS_IDS(&gen11_early_ops), -- cgit v1.2.3 From 8d651ee9c71bb12fc0c8eb2786b66cbe5aa3e43b Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 8 Jun 2021 11:54:33 +0200 Subject: x86/ioremap: Map EFI-reserved memory as encrypted for SEV Some drivers require memory that is marked as EFI boot services data. In order for this memory to not be re-used by the kernel after ExitBootServices(), efi_mem_reserve() is used to preserve it by inserting a new EFI memory descriptor and marking it with the EFI_MEMORY_RUNTIME attribute. Under SEV, memory marked with the EFI_MEMORY_RUNTIME attribute needs to be mapped encrypted by Linux, otherwise the kernel might crash at boot like below: EFI Variables Facility v0.08 2004-May-17 general protection fault, probably for non-canonical address 0x3597688770a868b2: 0000 [#1] SMP NOPTI CPU: 13 PID: 1 Comm: swapper/0 Not tainted 5.12.4-2-default #1 openSUSE Tumbleweed Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:efi_mokvar_entry_next [...] Call Trace: efi_mokvar_sysfs_init ? efi_mokvar_table_init do_one_initcall ? __kmalloc kernel_init_freeable ? rest_init kernel_init ret_from_fork Expand the __ioremap_check_other() function to additionally check for this other type of boot data reserved at runtime and indicate that it should be mapped encrypted for an SEV guest. [ bp: Massage commit message. ] Fixes: 58c909022a5a ("efi: Support for MOK variable config table") Reported-by: Joerg Roedel Signed-off-by: Tom Lendacky Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Tested-by: Joerg Roedel Cc: # 5.10+ Link: https://lkml.kernel.org/r/20210608095439.12668-2-joro@8bytes.org --- arch/x86/mm/ioremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 12c686c65ea9..60ade7dd71bd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -118,7 +118,9 @@ static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *des if (!IS_ENABLED(CONFIG_EFI)) return; - if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA) + if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA || + (efi_mem_type(addr) == EFI_BOOT_SERVICES_DATA && + efi_mem_attributes(addr) & EFI_MEMORY_RUNTIME)) desc->flags |= IORES_MAP_ENCRYPTED; } -- cgit v1.2.3 From 4f13d471e5d11034d56161af56d0f9396bc0b384 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 7 Jun 2021 06:15:32 +0000 Subject: KVM: SVM: Fix SEV SEND_START session length & SEND_UPDATE_DATA query length after commit 238eca821cee Commit 238eca821cee ("KVM: SVM: Allocate SEV command structures on local stack") uses the local stack to allocate the structures used to communicate with the PSP, which were earlier being kzalloced. This breaks SEV live migration for computing the SEND_START session length and SEND_UPDATE_DATA query length as session_len and trans_len and hdr_len fields are not zeroed respectively for the above commands before issuing the SEV Firmware API call, hence the firmware returns incorrect session length and update data header or trans length. Also the SEV Firmware API returns SEV_RET_INVALID_LEN firmware error for these length query API calls, and the return value and the firmware error needs to be passed to the userspace as it is, so need to remove the return check in the KVM code. Signed-off-by: Ashish Kalra Message-Id: <20210607061532.27459-1-Ashish.Kalra@amd.com> Fixes: 238eca821cee ("KVM: SVM: Allocate SEV command structures on local stack") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5bc887e9a986..e0ce5da97fc2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1103,10 +1103,9 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_start data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error); - if (ret < 0) - return ret; params->session_len = data.session_len; if (copy_to_user((void __user *)(uintptr_t)argp->data, params, @@ -1215,10 +1214,9 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp, struct sev_data_send_update_data data; int ret; + memset(&data, 0, sizeof(data)); data.handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error); - if (ret < 0) - return ret; params->hdr_len = data.hdr_len; params->trans_len = data.trans_len; -- cgit v1.2.3 From e898da784aed0ea65f7672d941c01dc9b79e6299 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 7 Jun 2021 00:19:43 -0700 Subject: KVM: LAPIC: Write 0 to TMICT should also cancel vmx-preemption timer According to the SDM 10.5.4.1: A write of 0 to the initial-count register effectively stops the local APIC timer, in both one-shot and periodic mode. However, the lapic timer oneshot/periodic mode which is emulated by vmx-preemption timer doesn't stop by writing 0 to TMICT since vmx->hv_deadline_tsc is still programmed and the guest will receive the spurious timer interrupt later. This patch fixes it by also cancelling the vmx-preemption timer when writing 0 to the initial-count register. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li Message-Id: <1623050385-100988-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8120e8614b92..6d72d8f43310 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1494,6 +1494,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic) static void cancel_hv_timer(struct kvm_lapic *apic); +static void cancel_apic_timer(struct kvm_lapic *apic) +{ + hrtimer_cancel(&apic->lapic_timer.timer); + preempt_disable(); + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_timer(apic); + preempt_enable(); +} + static void apic_update_lvtt(struct kvm_lapic *apic) { u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) & @@ -1502,11 +1511,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) if (apic->lapic_timer.timer_mode != timer_mode) { if (apic_lvtt_tscdeadline(apic) != (timer_mode == APIC_LVT_TIMER_TSCDEADLINE)) { - hrtimer_cancel(&apic->lapic_timer.timer); - preempt_disable(); - if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_timer(apic); - preempt_enable(); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; @@ -2092,7 +2097,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) if (apic_lvtt_tscdeadline(apic)) break; - hrtimer_cancel(&apic->lapic_timer.timer); + cancel_apic_timer(apic); kvm_lapic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); break; -- cgit v1.2.3 From b1bd5cba3306691c771d558e94baa73e8b0b96b7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 3 Jun 2021 13:24:55 +0800 Subject: KVM: X86: MMU: Use the correct inherited permissions to get shadow page When computing the access permissions of a shadow page, use the effective permissions of the walk up to that point, i.e. the logic AND of its parents' permissions. Two guest PxE entries that point at the same table gfn need to be shadowed with different shadow pages if their parents' permissions are different. KVM currently uses the effective permissions of the last non-leaf entry for all non-leaf entries. Because all non-leaf SPTEs have full ("uwx") permissions, and the effective permissions are recorded only in role.access and merged into the leaves, this can lead to incorrect reuse of a shadow page and eventually to a missing guest protection page fault. For example, here is a shared pagetable: pgd[] pud[] pmd[] virtual address pointers /->pmd1(u--)->pte1(uw-)->page1 <- ptr1 (u--) /->pud1(uw-)--->pmd2(uw-)->pte2(uw-)->page2 <- ptr2 (uw-) pgd-| (shared pmd[] as above) \->pud2(u--)--->pmd1(u--)->pte1(uw-)->page1 <- ptr3 (u--) \->pmd2(uw-)->pte2(uw-)->page2 <- ptr4 (u--) pud1 and pud2 point to the same pmd table, so: - ptr1 and ptr3 points to the same page. - ptr2 and ptr4 points to the same page. (pud1 and pud2 here are pud entries, while pmd1 and pmd2 here are pmd entries) - First, the guest reads from ptr1 first and KVM prepares a shadow page table with role.access=u--, from ptr1's pud1 and ptr1's pmd1. "u--" comes from the effective permissions of pgd, pud1 and pmd1, which are stored in pt->access. "u--" is used also to get the pagetable for pud1, instead of "uw-". - Then the guest writes to ptr2 and KVM reuses pud1 which is present. The hypervisor set up a shadow page for ptr2 with pt->access is "uw-" even though the pud1 pmd (because of the incorrect argument to kvm_mmu_get_page in the previous step) has role.access="u--". - Then the guest reads from ptr3. The hypervisor reuses pud1's shadow pmd for pud2, because both use "u--" for their permissions. Thus, the shadow pmd already includes entries for both pmd1 and pmd2. - At last, the guest writes to ptr4. This causes no vmexit or pagefault, because pud1's shadow page structures included an "uw-" page even though its role.access was "u--". Any kind of shared pagetable might have the similar problem when in virtual machine without TDP enabled if the permissions are different from different ancestors. In order to fix the problem, we change pt->access to be an array, and any access in it will not include permissions ANDed from child ptes. The test code is: https://lore.kernel.org/kvm/20210603050537.19605-1-jiangshanlai@gmail.com/ Remember to test it with TDP disabled. The problem had existed long before the commit 41074d07c78b ("KVM: MMU: Fix inherited permissions for emulated guest pte updates"), and it is hard to find which is the culprit. So there is no fixes tag here. Signed-off-by: Lai Jiangshan Message-Id: <20210603052455.21023-1-jiangshanlai@gmail.com> Cc: stable@vger.kernel.org Fixes: cea0f0e7ea54 ("[PATCH] KVM: MMU: Shadow page table caching") Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/mmu.rst | 4 ++-- arch/x86/kvm/mmu/paging_tmpl.h | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 5bfe28b0728e..20d85daed395 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -171,8 +171,8 @@ Shadow pages contain the following information: shadow pages) so role.quadrant takes values in the range 0..3. Each quadrant maps 1GB virtual address space. role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. + Inherited guest access permissions from the parent ptes in the form uwx. + Note execute permission is positive, not negative. role.invalid: The page is invalid and should not be used. It is a root page that is currently pinned (by a cpu hardware register pointing to it); once it is diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 70b7e44e3035..823a5919f9fa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -90,8 +90,8 @@ struct guest_walker { gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; bool pte_writable[PT_MAX_FULL_LEVELS]; - unsigned pt_access; - unsigned pte_access; + unsigned int pt_access[PT_MAX_FULL_LEVELS]; + unsigned int pte_access; gfn_t gfn; struct x86_exception fault; }; @@ -418,13 +418,15 @@ retry_walk: } walker->ptes[walker->level - 1] = pte; + + /* Convert to ACC_*_MASK flags for struct guest_walker. */ + walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); } while (!is_last_gpte(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) @@ -463,7 +465,8 @@ retry_walk: } pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, walker->pte_access, walker->pt_access); + __func__, (u64)pte, walker->pte_access, + walker->pt_access[walker->level - 1]); return 1; error: @@ -643,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; - unsigned direct_access, access = gw->pt_access; + unsigned int direct_access, access; int top_level, level, req_level, ret; gfn_t base_gfn = gw->gfn; @@ -675,6 +678,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, sp = NULL; if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; + access = gw->pt_access[it.level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, false, access); } -- cgit v1.2.3 From af3511ff7fa2107d6410831f3d71030f5e8d2b25 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 1 Jun 2021 01:46:28 +0800 Subject: KVM: x86: Ensure PV TLB flush tracepoint reflects KVM behavior In record_steal_time(), st->preempted is read twice, and trace_kvm_pv_tlb_flush() might output result inconsistent if kvm_vcpu_flush_tlb_guest() see a different st->preempted later. It is a very trivial problem and hardly has actual harm and can be avoided by reseting and reading st->preempted in atomic way via xchg(). Signed-off-by: Lai Jiangshan Message-Id: <20210531174628.10265-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1cd6d4685932..e2144eedaf79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3101,9 +3101,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + u8 st_preempted = xchg(&st->preempted, 0); + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + st_preempted & KVM_VCPU_FLUSH_TLB); + if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); } else { st->preempted = 0; -- cgit v1.2.3 From f31500b0d437a2464ca5972d8f5439e156b74960 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 7 Jun 2021 10:57:48 -0700 Subject: KVM: x86: Ensure liveliness of nested VM-Enter fail tracepoint message Use the __string() machinery provided by the tracing subystem to make a copy of the string literals consumed by the "nested VM-Enter failed" tracepoint. A complete copy is necessary to ensure that the tracepoint can't outlive the data/memory it consumes and deference stale memory. Because the tracepoint itself is defined by kvm, if kvm-intel and/or kvm-amd are built as modules, the memory holding the string literals defined by the vendor modules will be freed when the module is unloaded, whereas the tracepoint and its data in the ring buffer will live until kvm is unloaded (or "indefinitely" if kvm is built-in). This bug has existed since the tracepoint was added, but was recently exposed by a new check in tracing to detect exactly this type of bug. fmt: '%s%s ' current_buffer: ' vmx_dirty_log_t-140127 [003] .... kvm_nested_vmenter_failed: ' WARNING: CPU: 3 PID: 140134 at kernel/trace/trace.c:3759 trace_check_vprintf+0x3be/0x3e0 CPU: 3 PID: 140134 Comm: less Not tainted 5.13.0-rc1-ce2e73ce600a-req #184 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:trace_check_vprintf+0x3be/0x3e0 Code: <0f> 0b 44 8b 4c 24 1c e9 a9 fe ff ff c6 44 02 ff 00 49 8b 97 b0 20 RSP: 0018:ffffa895cc37bcb0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffa895cc37bd08 RCX: 0000000000000027 RDX: 0000000000000027 RSI: 00000000ffffdfff RDI: ffff9766cfad74f8 RBP: ffffffffc0a041d4 R08: ffff9766cfad74f0 R09: ffffa895cc37bad8 R10: 0000000000000001 R11: 0000000000000001 R12: ffffffffc0a041d4 R13: ffffffffc0f4dba8 R14: 0000000000000000 R15: ffff976409f2c000 FS: 00007f92fa200740(0000) GS:ffff9766cfac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000559bd11b0000 CR3: 000000019fbaa002 CR4: 00000000001726e0 Call Trace: trace_event_printf+0x5e/0x80 trace_raw_output_kvm_nested_vmenter_failed+0x3a/0x60 [kvm] print_trace_line+0x1dd/0x4e0 s_show+0x45/0x150 seq_read_iter+0x2d5/0x4c0 seq_read+0x106/0x150 vfs_read+0x98/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x40/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: Steven Rostedt Fixes: 380e0055bc7e ("KVM: nVMX: trace nested VM-Enter failures detected by H/W") Signed-off-by: Sean Christopherson Reviewed-by: Steven Rostedt (VMware) Message-Id: <20210607175748.674002-1-seanjc@google.com> --- arch/x86/kvm/trace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a61c015870e3..4f839148948b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1550,16 +1550,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed, TP_ARGS(msg, err), TP_STRUCT__entry( - __field(const char *, msg) + __string(msg, msg) __field(u32, err) ), TP_fast_assign( - __entry->msg = msg; + __assign_str(msg, msg); __entry->err = err; ), - TP_printk("%s%s", __entry->msg, !__entry->err ? "" : + TP_printk("%s%s", __get_str(msg), !__entry->err ? "" : __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); -- cgit v1.2.3 From ec35d1d93bf8976f0668cb1026ea8c7d7bcad3c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 8 Jun 2021 22:17:10 +0200 Subject: x86/setup: Document that Windows reserves the first MiB It does so unconditionally too, on Intel and AMD machines, to work around BIOS bugs, as confirmed by Microsoft folks (see Link for full details). Reflow the paragraph, while at it. Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/MWHPR21MB159330952629D36EEDE706B3D7379@MWHPR21MB1593.namprd21.prod.outlook.com --- arch/x86/kernel/setup.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7638ac6c3d80..85acd22f8022 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1060,17 +1060,18 @@ void __init setup_arch(char **cmdline_p) #endif /* - * Find free memory for the real mode trampoline and place it - * there. - * If there is not enough free memory under 1M, on EFI-enabled - * systems there will be additional attempt to reclaim the memory - * for the real mode trampoline at efi_free_boot_services(). + * Find free memory for the real mode trampoline and place it there. If + * there is not enough free memory under 1M, on EFI-enabled systems + * there will be additional attempt to reclaim the memory for the real + * mode trampoline at efi_free_boot_services(). * - * Unconditionally reserve the entire first 1M of RAM because - * BIOSes are know to corrupt low memory and several - * hundred kilobytes are not worth complex detection what memory gets - * clobbered. Moreover, on machines with SandyBridge graphics or in - * setups that use crashkernel the entire 1M is reserved anyway. + * Unconditionally reserve the entire first 1M of RAM because BIOSes + * are known to corrupt low memory and several hundred kilobytes are not + * worth complex detection what memory gets clobbered. Windows does the + * same thing for very similar reasons. + * + * Moreover, on machines with SandyBridge graphics or in setups that use + * crashkernel the entire 1M is reserved anyway. */ reserve_real_mode(); -- cgit v1.2.3 From b53e84eed08b88fd3ff59e5c2a7f1a69d4004e32 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 1 Jun 2021 01:22:56 +0800 Subject: KVM: x86: Unload MMU on guest TLB flush if TDP disabled to force MMU sync When using shadow paging, unload the guest MMU when emulating a guest TLB flush to ensure all roots are synchronized. From the guest's perspective, flushing the TLB ensures any and all modifications to its PTEs will be recognized by the CPU. Note, unloading the MMU is overkill, but is done to mirror KVM's existing handling of INVPCID(all) and ensure the bug is squashed. Future cleanup can be done to more precisely synchronize roots when servicing a guest TLB flush. If TDP is enabled, synchronizing the MMU is unnecessary even if nested TDP is in play, as a "legacy" TLB flush from L1 does not invalidate L1's TDP mappings. For EPT, an explicit INVEPT is required to invalidate guest-physical mappings; for NPT, guest mappings are always tagged with an ASID and thus can only be invalidated via the VMCB's ASID control. This bug has existed since the introduction of KVM_VCPU_FLUSH_TLB. It was only recently exposed after Linux guests stopped flushing the local CPU's TLB prior to flushing remote TLBs (see commit 4ce94eabac16, "x86/mm/tlb: Flush remote and local TLBs concurrently"), but is also visible in Windows 10 guests. Tested-by: Maxim Levitsky Reviewed-by: Maxim Levitsky Fixes: f38a7b75267f ("KVM: X86: support paravirtualized help for TLB shootdowns") Signed-off-by: Lai Jiangshan [sean: massaged comment and changelog] Message-Id: <20210531172256.2908-1-jiangshanlai@gmail.com> Signed-off-by: Sean Christopherson Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e2144eedaf79..9dd23bdfc6cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3072,6 +3072,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; + + if (!tdp_enabled) { + /* + * A TLB flush on behalf of the guest is equivalent to + * INVPCID(all), toggling CR4.PGE, etc., which requires + * a forced sync of the shadow page tables. Unload the + * entire MMU here and the subsequent load will sync the + * shadow page tables, and also flush the TLB. + */ + kvm_mmu_unload(vcpu); + return; + } + static_call(kvm_x86_tlb_flush_guest)(vcpu); } -- cgit v1.2.3 From 484cea4f362e1eeb5c869abbfb5f90eae6421b38 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jun 2021 16:36:18 +0200 Subject: x86/fpu: Prevent state corruption in __fpu__restore_sig() The non-compacted slowpath uses __copy_from_user() and copies the entire user buffer into the kernel buffer, verbatim. This means that the kernel buffer may now contain entirely invalid state on which XRSTOR will #GP. validate_user_xstate_header() can detect some of that corruption, but that leaves the onus on callers to clear the buffer. Prior to XSAVES support, it was possible just to reinitialize the buffer, completely, but with supervisor states that is not longer possible as the buffer clearing code split got it backwards. Fixing that is possible but not corrupting the state in the first place is more robust. Avoid corruption of the kernel XSAVE buffer by using copy_user_to_xstate() which validates the XSAVE header contents before copying the actual states to the kernel. copy_user_to_xstate() was previously only called for compacted-format kernel buffers, but it works for both compacted and non-compacted forms. Using it for the non-compacted form is slower because of multiple __copy_from_user() operations, but that cost is less important than robust code in an already slow path. [ Changelog polished by Dave Hansen ] Fixes: b860eb8dce59 ("x86/fpu/xstate: Define new functions for clearing fpregs and xstates") Reported-by: syzbot+2067e764dbcd10721e2e@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210608144345.611833074@linutronix.de --- arch/x86/kernel/fpu/signal.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a4ec65317a7f..d5bc96a536c2 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -405,14 +405,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; - if (using_compacted_format()) { - ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); - } else { - ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - - if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_user_xstate_header(&fpu->state.xsave.header); - } + ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) goto err_out; -- cgit v1.2.3 From d8778e393afa421f1f117471144f8ce6deb6953a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 8 Jun 2021 16:36:19 +0200 Subject: x86/fpu: Invalidate FPU state after a failed XRSTOR from a user buffer Both Intel and AMD consider it to be architecturally valid for XRSTOR to fail with #PF but nonetheless change the register state. The actual conditions under which this might occur are unclear [1], but it seems plausible that this might be triggered if one sibling thread unmaps a page and invalidates the shared TLB while another sibling thread is executing XRSTOR on the page in question. __fpu__restore_sig() can execute XRSTOR while the hardware registers are preserved on behalf of a different victim task (using the fpu_fpregs_owner_ctx mechanism), and, in theory, XRSTOR could fail but modify the registers. If this happens, then there is a window in which __fpu__restore_sig() could schedule out and the victim task could schedule back in without reloading its own FPU registers. This would result in part of the FPU state that __fpu__restore_sig() was attempting to load leaking into the victim task's user-visible state. Invalidate preserved FPU registers on XRSTOR failure to prevent this situation from corrupting any state. [1] Frequent readers of the errata lists might imagine "complex microarchitectural conditions". Fixes: 1d731e731c4c ("x86/fpu: Add a fastpath to __fpu__restore_sig()") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210608144345.758116583@linutronix.de --- arch/x86/kernel/fpu/signal.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index d5bc96a536c2..4ab9aeb9a963 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -369,6 +369,25 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); return 0; } + + /* + * The above did an FPU restore operation, restricted to + * the user portion of the registers, and failed, but the + * microcode might have modified the FPU registers + * nevertheless. + * + * If the FPU registers do not belong to current, then + * invalidate the FPU register state otherwise the task might + * preempt current and return to user space with corrupted + * FPU registers. + * + * In case current owns the FPU registers then no further + * action is required. The fixup below will handle it + * correctly. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); } else { /* -- cgit v1.2.3 From 12f7764ac61200e32c916f038bdc08f884b0b604 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jun 2021 16:36:20 +0200 Subject: x86/process: Check PF_KTHREAD and not current->mm for kernel threads switch_fpu_finish() checks current->mm as indicator for kernel threads. That's wrong because kernel threads can temporarily use a mm of a user process via kthread_use_mm(). Check the task flags for PF_KTHREAD instead. Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210608144345.912645927@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index ceeba9f63172..18382ac1ecc4 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -578,7 +578,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * PKRU state is switched eagerly because it needs to be valid before we * return to userland e.g. for a copy_to_user() operation. */ - if (current->mm) { + if (!(current->flags & PF_KTHREAD)) { pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); if (pk) pkru_val = pk->pkru; -- cgit v1.2.3 From 510b80a6a0f1a0d114c6e33bcea64747d127973c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Jun 2021 16:36:21 +0200 Subject: x86/pkru: Write hardware init value to PKRU when xstate is init When user space brings PKRU into init state, then the kernel handling is broken: T1 user space xsave(state) state.header.xfeatures &= ~XFEATURE_MASK_PKRU; xrstor(state) T1 -> kernel schedule() XSAVE(S) -> T1->xsave.header.xfeatures[PKRU] == 0 T1->flags |= TIF_NEED_FPU_LOAD; wrpkru(); schedule() ... pk = get_xsave_addr(&T1->fpu->state.xsave, XFEATURE_PKRU); if (pk) wrpkru(pk->pkru); else wrpkru(DEFAULT_PKRU); Because the xfeatures bit is 0 and therefore the value in the xsave storage is not valid, get_xsave_addr() returns NULL and switch_to() writes the default PKRU. -> FAIL #1! So that wrecks any copy_to/from_user() on the way back to user space which hits memory which is protected by the default PKRU value. Assumed that this does not fail (pure luck) then T1 goes back to user space and because TIF_NEED_FPU_LOAD is set it ends up in switch_fpu_return() __fpregs_load_activate() if (!fpregs_state_valid()) { load_XSTATE_from_task(); } But if nothing touched the FPU between T1 scheduling out and back in, then the fpregs_state is still valid which means switch_fpu_return() does nothing and just clears TIF_NEED_FPU_LOAD. Back to user space with DEFAULT_PKRU loaded. -> FAIL #2! The fix is simple: if get_xsave_addr() returns NULL then set the PKRU value to 0 instead of the restrictive default PKRU value in init_pkru_value. [ bp: Massage in minor nitpicks from folks. ] Fixes: 0cecca9d03c9 ("x86/fpu: Eager switch PKRU state") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Tested-by: Babu Moger Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210608144346.045616965@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 18382ac1ecc4..fdee23ea4e17 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -579,9 +579,16 @@ static inline void switch_fpu_finish(struct fpu *new_fpu) * return to userland e.g. for a copy_to_user() operation. */ if (!(current->flags & PF_KTHREAD)) { + /* + * If the PKRU bit in xsave.header.xfeatures is not set, + * then the PKRU component was in init state, which means + * XRSTOR will set PKRU to 0. If the bit is not set then + * get_xsave_addr() will return NULL because the PKRU value + * in memory is not valid. This means pkru_val has to be + * set to 0 and not to init_pkru_value. + */ pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); - if (pk) - pkru_val = pk->pkru; + pkru_val = pk ? pk->pkru : 0; } __write_pkru(pkru_val); } -- cgit v1.2.3 From f72a249b0ba85564c6bfa94d609a70567485a061 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 8 Jun 2021 16:36:22 +0200 Subject: x86/fpu: Add address range checks to copy_user_to_xstate() copy_user_to_xstate() uses __copy_from_user(), which provides a negligible speedup. Fortunately, both call sites are at least almost correct. __fpu__restore_sig() checks access_ok() with xstate_sigframe_size() length and ptrace regset access uses fpu_user_xstate_size. These should be valid upper bounds on the length, so, at worst, this would cause spurious failures and not accesses to kernel memory. Nonetheless, this is far more fragile than necessary and none of these callers are in a hotpath. Use copy_from_user() instead. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Acked-by: Rik van Riel Link: https://lkml.kernel.org/r/20210608144346.140254130@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a85c64000218..8ac0f67b861a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1190,7 +1190,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) offset = offsetof(struct xregs_state, header); size = sizeof(hdr); - if (__copy_from_user(&hdr, ubuf + offset, size)) + if (copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; if (validate_user_xstate_header(&hdr)) @@ -1205,7 +1205,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) offset = xstate_offsets[i]; size = xstate_sizes[i]; - if (__copy_from_user(dst, ubuf + offset, size)) + if (copy_from_user(dst, ubuf + offset, size)) return -EFAULT; } } @@ -1213,7 +1213,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { offset = offsetof(struct fxregs_state, mxcsr); size = MXCSR_AND_FLAGS_SIZE; - if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) + if (copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) return -EFAULT; } -- cgit v1.2.3 From 218bf772bddd221489c38dde6ef8e917131161f6 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 2 Jun 2021 13:52:24 -0700 Subject: kvm: LAPIC: Restore guard to prevent illegal APIC register access Per the SDM, "any access that touches bytes 4 through 15 of an APIC register may cause undefined behavior and must not be executed." Worse, such an access in kvm_lapic_reg_read can result in a leak of kernel stack contents. Prior to commit 01402cf81051 ("kvm: LAPIC: write down valid APIC registers"), such an access was explicitly disallowed. Restore the guard that was removed in that commit. Fixes: 01402cf81051 ("kvm: LAPIC: write down valid APIC registers") Signed-off-by: Jim Mattson Reported-by: syzbot Message-Id: <20210602205224.3189316-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6d72d8f43310..17fa4ab1b834 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1410,6 +1410,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); + if (alignment + len > 4) + return 1; + if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) return 1; -- cgit v1.2.3 From efa165504943f2128d50f63de0c02faf6dcceb0d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Jun 2021 21:18:00 +0200 Subject: x86/fpu: Reset state for all signal restore failures If access_ok() or fpregs_soft_set() fails in __fpu__restore_sig() then the function just returns but does not clear the FPU state as it does for all other fatal failures. Clear the FPU state for these failures as well. Fixes: 72a671ced66d ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87mtryyhhz.ffs@nanos.tec.linutronix.de --- arch/x86/kernel/fpu/signal.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 4ab9aeb9a963..ec3ae3054792 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -307,13 +307,17 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) return 0; } - if (!access_ok(buf, size)) - return -EACCES; + if (!access_ok(buf, size)) { + ret = -EACCES; + goto out; + } - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + if (!static_cpu_has(X86_FEATURE_FPU)) { + ret = fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + goto out; + } if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -396,7 +400,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ ret = __copy_from_user(&env, buf, sizeof(env)); if (ret) - goto err_out; + goto out; envp = &env; } @@ -426,7 +430,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) - goto err_out; + goto out; sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, fx_only); @@ -446,7 +450,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); if (ret) { ret = -EFAULT; - goto err_out; + goto out; } sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, @@ -464,7 +468,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); if (ret) - goto err_out; + goto out; fpregs_lock(); ret = copy_kernel_to_fregs_err(&fpu->state.fsave); @@ -475,7 +479,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_deactivate(fpu); fpregs_unlock(); -err_out: +out: if (ret) fpu__clear_user_states(fpu); return ret; -- cgit v1.2.3 From a8383dfb2138742a1bb77b481ada047aededa2ba Mon Sep 17 00:00:00 2001 From: CodyYao-oc Date: Mon, 7 Jun 2021 10:53:35 +0800 Subject: x86/nmi_watchdog: Fix old-style NMI watchdog regression on old Intel CPUs The following commit: 3a4ac121c2ca ("x86/perf: Add hardware performance events support for Zhaoxin CPU.") Got the old-style NMI watchdog logic wrong and broke it for basically every Intel CPU where it was active. Which is only truly old CPUs, so few people noticed. On CPUs with perf events support we turn off the old-style NMI watchdog, so it was pretty pointless to add the logic for X86_VENDOR_ZHAOXIN to begin with ... :-/ Anyway, the fix is to restore the old logic and add a 'break'. [ mingo: Wrote a new changelog. ] Fixes: 3a4ac121c2ca ("x86/perf: Add hardware performance events support for Zhaoxin CPU.") Signed-off-by: CodyYao-oc Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210607025335.9643-1-CodyYao-oc@zhaoxin.com --- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 3ef5868ac588..7aecb2fc3186 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -63,7 +63,7 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BPU_PERFCTR0; } - fallthrough; + break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: return msr - MSR_ARCH_PERFMON_PERFCTR0; @@ -96,7 +96,7 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } - fallthrough; + break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: return msr - MSR_ARCH_PERFMON_EVENTSEL0; -- cgit v1.2.3 From 02ffbe6351f5c88337143bcbc649832ded7445c0 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 9 Jun 2021 20:22:17 +0800 Subject: KVM: SVM: fix doc warnings Fix kernel-doc warnings: arch/x86/kvm/svm/avic.c:233: warning: Function parameter or member 'activate' not described in 'avic_update_access_page' arch/x86/kvm/svm/avic.c:233: warning: Function parameter or member 'kvm' not described in 'avic_update_access_page' arch/x86/kvm/svm/avic.c:781: warning: Function parameter or member 'e' not described in 'get_pi_vcpu_info' arch/x86/kvm/svm/avic.c:781: warning: Function parameter or member 'kvm' not described in 'get_pi_vcpu_info' arch/x86/kvm/svm/avic.c:781: warning: Function parameter or member 'svm' not described in 'get_pi_vcpu_info' arch/x86/kvm/svm/avic.c:781: warning: Function parameter or member 'vcpu_info' not described in 'get_pi_vcpu_info' arch/x86/kvm/svm/avic.c:1009: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Signed-off-by: ChenXiaoSong Message-Id: <20210609122217.2967131-1-chenxiaosong2@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0e62e6a2438c..5e7e920113f3 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -221,7 +221,7 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, return &avic_physical_id_table[index]; } -/** +/* * Note: * AVIC hardware walks the nested page table to check permissions, * but does not use the SPA address specified in the leaf page @@ -764,7 +764,7 @@ out: return ret; } -/** +/* * Note: * The HW cannot support posting multicast/broadcast * interrupts to a vCPU. So, we still use legacy interrupt @@ -1005,7 +1005,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -/** +/* * This function is called during VCPU halt/unhalt. */ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) -- cgit v1.2.3 From 551912d286e940e63abe9e005f434691ee24fd15 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 28 May 2021 15:07:56 -0500 Subject: KVM: x86: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a couple of warnings by explicitly adding break statements instead of just letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Message-Id: <20210528200756.GA39320@embeddedor> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 1 + arch/x86/kvm/vmx/vmx.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9a48f138832d..b4da665bb892 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -655,6 +655,7 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) entry->ecx = F(RDPID); ++array->nent; + break; default: break; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 50b42d7a8a11..c2a779b688e6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6247,6 +6247,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) switch (kvm_get_apic_mode(vcpu)) { case LAPIC_MODE_INVALID: WARN_ONCE(true, "Invalid local APIC state"); + break; case LAPIC_MODE_DISABLED: break; case LAPIC_MODE_XAPIC: -- cgit v1.2.3 From 78fcb2c91adfec8ce3a2ba6b4d0dda89f2f4a7c6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:11 -0700 Subject: KVM: x86: Immediately reset the MMU context when the SMM flag is cleared Immediately reset the MMU context when the vCPU's SMM flag is cleared so that the SMM flag in the MMU role is always synchronized with the vCPU's flag. If RSM fails (which isn't correctly emulated), KVM will bail without calling post_leave_smm() and leave the MMU in a bad state. The bad MMU role can lead to a NULL pointer dereference when grabbing a shadow page's rmap for a page fault as the initial lookups for the gfn will happen with the vCPU's SMM flag (=0), whereas the rmap lookup will use the shadow page's SMM flag, which comes from the MMU (=1). SMM has an entirely different set of memslots, and so the initial lookup can find a memslot (SMM=0) and then explode on the rmap memslot lookup (SMM=1). general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 PID: 8410 Comm: syz-executor382 Not tainted 5.13.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__gfn_to_rmap arch/x86/kvm/mmu/mmu.c:935 [inline] RIP: 0010:gfn_to_rmap+0x2b0/0x4d0 arch/x86/kvm/mmu/mmu.c:947 Code: <42> 80 3c 20 00 74 08 4c 89 ff e8 f1 79 a9 00 4c 89 fb 4d 8b 37 44 RSP: 0018:ffffc90000ffef98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888015b9f414 RCX: ffff888019669c40 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000001 RBP: 0000000000000001 R08: ffffffff811d9cdb R09: ffffed10065a6002 R10: ffffed10065a6002 R11: 0000000000000000 R12: dffffc0000000000 R13: 0000000000000003 R14: 0000000000000001 R15: 0000000000000000 FS: 000000000124b300(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000028e31000 CR4: 00000000001526e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: rmap_add arch/x86/kvm/mmu/mmu.c:965 [inline] mmu_set_spte+0x862/0xe60 arch/x86/kvm/mmu/mmu.c:2604 __direct_map arch/x86/kvm/mmu/mmu.c:2862 [inline] direct_page_fault+0x1f74/0x2b70 arch/x86/kvm/mmu/mmu.c:3769 kvm_mmu_do_page_fault arch/x86/kvm/mmu.h:124 [inline] kvm_mmu_page_fault+0x199/0x1440 arch/x86/kvm/mmu/mmu.c:5065 vmx_handle_exit+0x26/0x160 arch/x86/kvm/vmx/vmx.c:6122 vcpu_enter_guest+0x3bdd/0x9630 arch/x86/kvm/x86.c:9428 vcpu_run+0x416/0xc20 arch/x86/kvm/x86.c:9494 kvm_arch_vcpu_ioctl_run+0x4e8/0xa40 arch/x86/kvm/x86.c:9722 kvm_vcpu_ioctl+0x70f/0xbb0 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3460 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:1069 [inline] __se_sys_ioctl+0xfb/0x170 fs/ioctl.c:1055 do_syscall_64+0x3f/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x440ce9 Cc: stable@vger.kernel.org Reported-by: syzbot+fb0b6a7e8713aeb0319c@syzkaller.appspotmail.com Fixes: 9ec19493fb86 ("KVM: x86: clear SMM flags before loading state while leaving SMM") Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9dd23bdfc6cc..54d212fe9b15 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7106,7 +7106,10 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) { - emul_to_vcpu(ctxt)->arch.hflags = emul_flags; + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + vcpu->arch.hflags = emul_flags; + kvm_mmu_reset_context(vcpu); } static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, -- cgit v1.2.3 From 1b82435d17774f3eaab35dce239d354548aa9da2 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 3 Jun 2021 01:53:40 -0400 Subject: crypto: x86/curve25519 - fix cpu feature checking logic in mod_exit In curve25519_mod_init() the curve25519_alg will be registered only when (X86_FEATURE_BMI2 && X86_FEATURE_ADX). But in curve25519_mod_exit() it still checks (X86_FEATURE_BMI2 || X86_FEATURE_ADX) when do crypto unregister. This will trigger a BUG_ON in crypto_unregister_alg() as alg->cra_refcnt is 0 if the cpu only supports one of X86_FEATURE_BMI2 and X86_FEATURE_ADX. Fixes: 07b586fe0662 ("crypto: x86/curve25519 - replace with formally verified implementation") Signed-off-by: Hangbin Liu Reviewed-by: Jason A. Donenfeld Signed-off-by: Herbert Xu --- arch/x86/crypto/curve25519-x86_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 6706b6cb1d0f..38caf61cd5b7 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -1500,7 +1500,7 @@ static int __init curve25519_mod_init(void) static void __exit curve25519_mod_exit(void) { if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && - (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX))) + static_branch_likely(&curve25519_use_bmi2_adx)) crypto_unregister_kpp(&curve25519_alg); } -- cgit v1.2.3 From 1d3156396cf6ea0873145092f4e040374ff1d862 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 9 Jun 2021 11:55:10 +0800 Subject: x86/sgx: Correct kernel-doc's arg name in sgx_encl_release() Fix the following kernel-doc warning: arch/x86/kernel/cpu/sgx/encl.c:392: warning: Function parameter \ or member 'ref' not described in 'sgx_encl_release' [ bp: Massage commit message. ] Signed-off-by: ChenXiaoSong Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210609035510.2083694-1-chenxiaosong2@huawei.com --- arch/x86/kernel/cpu/sgx/encl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 3be203297988..001808e3901c 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -383,7 +383,7 @@ const struct vm_operations_struct sgx_vm_ops = { /** * sgx_encl_release - Destroy an enclave instance - * @kref: address of a kref inside &sgx_encl + * @ref: address of a kref inside &sgx_encl * * Used together with kref_put(). Frees all the resources associated with the * enclave and the instance itself. -- cgit v1.2.3 From 934002cd660b035b926438244b4294e647507e13 Mon Sep 17 00:00:00 2001 From: Alper Gun Date: Thu, 10 Jun 2021 17:46:04 +0000 Subject: KVM: SVM: Call SEV Guest Decommission if ASID binding fails Send SEV_CMD_DECOMMISSION command to PSP firmware if ASID binding fails. If a failure happens after a successful LAUNCH_START command, a decommission command should be executed. Otherwise, guest context will be unfreed inside the AMD SP. After the firmware will not have memory to allocate more SEV guest context, LAUNCH_START command will begin to fail with SEV_RET_RESOURCE_LIMIT error. The existing code calls decommission inside sev_unbind_asid, but it is not called if a failure happens before guest activation succeeds. If sev_bind_asid fails, decommission is never called. PSP firmware has a limit for the number of guests. If sev_asid_binding fails many times, PSP firmware will not have resources to create another guest context. Cc: stable@vger.kernel.org Fixes: 59414c989220 ("KVM: SVM: Add support for KVM_SEV_LAUNCH_START command") Reported-by: Peter Gonda Signed-off-by: Alper Gun Reviewed-by: Marc Orr Signed-off-by: Paolo Bonzini Message-Id: <20210610174604.2554090-1-alpergun@google.com> --- arch/x86/kvm/svm/sev.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index e0ce5da97fc2..8d36f0c73071 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -199,9 +199,19 @@ static void sev_asid_free(struct kvm_sev_info *sev) sev->misc_cg = NULL; } -static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) +static void sev_decommission(unsigned int handle) { struct sev_data_decommission decommission; + + if (!handle) + return; + + decommission.handle = handle; + sev_guest_decommission(&decommission, NULL); +} + +static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) +{ struct sev_data_deactivate deactivate; if (!handle) @@ -214,9 +224,7 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) sev_guest_deactivate(&deactivate, NULL); up_read(&sev_deactivate_lock); - /* decommission handle */ - decommission.handle = handle; - sev_guest_decommission(&decommission, NULL); + sev_decommission(handle); } static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) @@ -341,8 +349,10 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Bind ASID to this guest */ ret = sev_bind_asid(kvm, start.handle, error); - if (ret) + if (ret) { + sev_decommission(start.handle); goto e_free_session; + } /* return handle to userspace */ params.handle = start.handle; -- cgit v1.2.3 From dfdc0a714d241bfbf951886c373cd1ae463fcc25 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 10 Jun 2021 21:59:33 -0700 Subject: KVM: X86: Fix x86_emulator slab cache leak Commit c9b8b07cded58 (KVM: x86: Dynamically allocate per-vCPU emulation context) tries to allocate per-vCPU emulation context dynamically, however, the x86_emulator slab cache is still exiting after the kvm module is unload as below after destroying the VM and unloading the kvm module. grep x86_emulator /proc/slabinfo x86_emulator 36 36 2672 12 8 : tunables 0 0 0 : slabdata 3 3 0 This patch fixes this slab cache leak by destroying the x86_emulator slab cache when the kvm module is unloaded. Fixes: c9b8b07cded58 (KVM: x86: Dynamically allocate per-vCPU emulation context) Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Message-Id: <1623387573-5969-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54d212fe9b15..6d425310054b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8261,6 +8261,7 @@ void kvm_arch_exit(void) kvm_x86_ops.hardware_enable = NULL; kvm_mmu_module_exit(); free_percpu(user_return_msrs); + kmem_cache_destroy(x86_emulator_cache); kmem_cache_destroy(x86_fpu_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); -- cgit v1.2.3 From 654430efde27248be563df9a88631204b5fe2df2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Jun 2021 15:00:26 -0700 Subject: KVM: x86/mmu: Calculate and check "full" mmu_role for nested MMU Calculate and check the full mmu_role when initializing the MMU context for the nested MMU, where "full" means the bits and pieces of the role that aren't handled by kvm_calc_mmu_role_common(). While the nested MMU isn't used for shadow paging, things like the number of levels in the guest's page tables are surprisingly important when walking the guest page tables. Failure to reinitialize the nested MMU context if L2's paging mode changes can result in unexpected and/or missed page faults, and likely other explosions. E.g. if an L1 vCPU is running both a 32-bit PAE L2 and a 64-bit L2, the "common" role calculation will yield the same role for both L2s. If the 64-bit L2 is run after the 32-bit PAE L2, L0 will fail to reinitialize the nested MMU context, ultimately resulting in a bad walk of L2's page tables as the MMU will still have a guest root_level of PT32E_ROOT_LEVEL. WARNING: CPU: 4 PID: 167334 at arch/x86/kvm/vmx/vmx.c:3075 ept_save_pdptrs+0x15/0xe0 [kvm_intel] Modules linked in: kvm_intel] CPU: 4 PID: 167334 Comm: CPU 3/KVM Not tainted 5.13.0-rc1-d849817d5673-reqs #185 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:ept_save_pdptrs+0x15/0xe0 [kvm_intel] Code: <0f> 0b c3 f6 87 d8 02 00f RSP: 0018:ffffbba702dbba00 EFLAGS: 00010202 RAX: 0000000000000011 RBX: 0000000000000002 RCX: ffffffff810a2c08 RDX: ffff91d7bc30acc0 RSI: 0000000000000011 RDI: ffff91d7bc30a600 RBP: ffff91d7bc30a600 R08: 0000000000000010 R09: 0000000000000007 R10: 0000000000000000 R11: 0000000000000000 R12: ffff91d7bc30a600 R13: ffff91d7bc30acc0 R14: ffff91d67c123460 R15: 0000000115d7e005 FS: 00007fe8e9ffb700(0000) GS:ffff91d90fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000029f15a001 CR4: 00000000001726e0 Call Trace: kvm_pdptr_read+0x3a/0x40 [kvm] paging64_walk_addr_generic+0x327/0x6a0 [kvm] paging64_gva_to_gpa_nested+0x3f/0xb0 [kvm] kvm_fetch_guest_virt+0x4c/0xb0 [kvm] __do_insn_fetch_bytes+0x11a/0x1f0 [kvm] x86_decode_insn+0x787/0x1490 [kvm] x86_decode_emulated_instruction+0x58/0x1e0 [kvm] x86_emulate_instruction+0x122/0x4f0 [kvm] vmx_handle_exit+0x120/0x660 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xe25/0x1cb0 [kvm] kvm_vcpu_ioctl+0x211/0x5a0 [kvm] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x40/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae Cc: Vitaly Kuznetsov Cc: stable@vger.kernel.org Fixes: bf627a928837 ("x86/kvm/mmu: check if MMU reconfiguration is needed in init_kvm_nested_mmu()") Signed-off-by: Sean Christopherson Message-Id: <20210610220026.1364486-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0144c40d09c7..8d5876dfc6b7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4739,9 +4739,33 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) context->inject_page_fault = kvm_inject_page_fault; } +static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_role role = kvm_calc_shadow_root_page_role_common(vcpu, false); + + /* + * Nested MMUs are used only for walking L2's gva->gpa, they never have + * shadow pages of their own and so "direct" has no meaning. Set it + * to "true" to try to detect bogus usage of the nested MMU. + */ + role.base.direct = true; + + if (!is_paging(vcpu)) + role.base.level = 0; + else if (is_long_mode(vcpu)) + role.base.level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : + PT64_ROOT_4LEVEL; + else if (is_pae(vcpu)) + role.base.level = PT32E_ROOT_LEVEL; + else + role.base.level = PT32_ROOT_LEVEL; + + return role; +} + static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { - union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false); + union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; if (new_role.as_u64 == g_context->mmu_role.as_u64) -- cgit v1.2.3 From 2398ce80152aae33b9501ef54452e09e8e8d4262 Mon Sep 17 00:00:00 2001 From: Tor Vic Date: Thu, 10 Jun 2021 20:58:06 +0000 Subject: x86, lto: Pass -stack-alignment only on LLD < 13.0.0 Since LLVM commit 3787ee4, the '-stack-alignment' flag has been dropped [1], leading to the following error message when building a LTO kernel with Clang-13 and LLD-13: ld.lld: error: -plugin-opt=-: ld.lld: Unknown command line argument '-stack-alignment=8'. Try 'ld.lld --help' ld.lld: Did you mean '--stackrealign=8'? It also appears that the '-code-model' flag is not necessary anymore starting with LLVM-9 [2]. Drop '-code-model' and make '-stack-alignment' conditional on LLD < 13.0.0. These flags were necessary because these flags were not encoded in the IR properly, so the link would restart optimizations without them. Now there are properly encoded in the IR, and these flags exposing implementation details are no longer necessary. [1] https://reviews.llvm.org/D103048 [2] https://reviews.llvm.org/D52322 Cc: stable@vger.kernel.org Link: https://github.com/ClangBuiltLinux/linux/issues/1377 Signed-off-by: Tor Vic Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/f2c018ee-5999-741e-58d4-e482d5246067@mailbox.org --- arch/x86/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 307529417021..cb5e8d39cac1 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -200,8 +200,9 @@ endif KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE) ifdef CONFIG_LTO_CLANG -KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \ - -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) +ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 130000; echo $$?),0) +KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8) +endif endif ifdef CONFIG_X86_NEED_RELOCS -- cgit v1.2.3 From 583bfd484bcc85e9371e7205fa9e827c18ae34fb Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 29 Apr 2021 16:26:12 -0700 Subject: x86, lto: Enable Clang LTO for 32-bit as well Commit b33fff07e3e3 ("x86, build: allow LTO to be selected") enabled support for LTO for x86_64 but 32-bit works fine as well. I tested the following config combinations: * i386_defconfig + CONFIG_LTO_CLANG_FULL=y * i386_defconfig + CONFIG_LTO_CLANG_THIN=y * ARCH=i386 allmodconfig + CONFIG_LTO_CLANG_THIN=y with LLVM 11.1.0, 12.0.0, and 13.0.0 from git without any build failures. The defconfigs boot in QEMU with no new warnings. Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210429232611.3966964-1-nathan@kernel.org --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0045e1b44190..12fa0d7bfa64 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -103,8 +103,8 @@ config X86 select ARCH_SUPPORTS_DEBUG_PAGEALLOC select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_LTO_CLANG if X86_64 - select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS -- cgit v1.2.3 From 4aca2d99fd27698cf82d55aed4859fde859082ac Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 19 May 2021 15:52:47 +0200 Subject: x86/sev: Fix error message in runtime #VC handler The runtime #VC handler is not "early" anymore. Fix the copy&paste error and remove that word from the error message. Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210614135327.9921-2-joro@8bytes.org --- arch/x86/kernel/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 651b81cd648e..4fd997bbf059 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1369,7 +1369,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) vc_finish_insn(&ctxt); break; case ES_UNSUPPORTED: - pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", + pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", error_code, regs->ip); goto fail; case ES_VMM_ERROR: -- cgit v1.2.3 From f2df15639e44d23bf82a86a03092472c7278cd39 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 19 May 2021 15:52:49 +0200 Subject: x86/insn-eval: Make 0 a valid RIP for insn_get_effective_ip() In theory, 0 is a valid value for the instruction pointer so don't use it as the error return value from insn_get_effective_ip(). Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210614135327.9921-5-joro@8bytes.org --- arch/x86/lib/insn-eval.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index a67afd74232c..4eecb9c7c6a0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1417,7 +1417,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs) } } -static unsigned long insn_get_effective_ip(struct pt_regs *regs) +static int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip) { unsigned long seg_base = 0; @@ -1430,10 +1430,12 @@ static unsigned long insn_get_effective_ip(struct pt_regs *regs) if (!user_64bit_mode(regs)) { seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS); if (seg_base == -1L) - return 0; + return -EINVAL; } - return seg_base + regs->ip; + *ip = seg_base + regs->ip; + + return 0; } /** @@ -1455,8 +1457,7 @@ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) unsigned long ip; int not_copied; - ip = insn_get_effective_ip(regs); - if (!ip) + if (insn_get_effective_ip(regs, &ip)) return 0; not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); @@ -1484,8 +1485,7 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_IN unsigned long ip; int not_copied; - ip = insn_get_effective_ip(regs); - if (!ip) + if (insn_get_effective_ip(regs, &ip)) return 0; not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); -- cgit v1.2.3 From 4aaa7eacd7cc7c10f269c7f2a01d044b375bed8e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Jun 2021 15:53:26 +0200 Subject: x86/insn: Extend error reporting from insn_fetch_from_user[_inatomic]() The error reporting from the insn_fetch_from_user*() functions is not very verbose. Extend it to include information on whether the linear RIP could not be calculated or whether the memory access faulted. This will be used in the SEV-ES code to propagate the correct exception depending on what went wrong during instruction fetch. [ bp: Massage comments. ] Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210614135327.9921-6-joro@8bytes.org --- arch/x86/kernel/sev.c | 8 ++++---- arch/x86/kernel/umip.c | 10 ++++------ arch/x86/lib/insn-eval.c | 16 ++++++++-------- 3 files changed, 16 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 4fd997bbf059..a1eeaa79bf2b 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -258,17 +258,17 @@ static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) { char buffer[MAX_INSN_SIZE]; - int res; + int insn_bytes; - res = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (!res) { + insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); + if (insn_bytes <= 0) { ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; ctxt->fi.cr2 = ctxt->regs->ip; return ES_EXCEPTION; } - if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res)) + if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) return ES_DECODE_FAILED; if (ctxt->insn.immediate.got) diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 8daa70b0d2da..576b47e7523d 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -346,14 +346,12 @@ bool fixup_umip_exception(struct pt_regs *regs) if (!regs) return false; - nr_copied = insn_fetch_from_user(regs, buf); - /* - * The insn_fetch_from_user above could have failed if user code - * is protected by a memory protection key. Give up on emulation - * in such a case. Should we issue a page fault? + * Give up on emulation if fetching the instruction failed. Should a + * page fault or a #GP be issued? */ - if (!nr_copied) + nr_copied = insn_fetch_from_user(regs, buf); + if (nr_copied <= 0) return false; if (!insn_decode_from_regs(&insn, regs, buf, nr_copied)) diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 4eecb9c7c6a0..a1d24fdc07cf 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -1448,9 +1448,9 @@ static int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip) * * Returns: * - * Number of instruction bytes copied. - * - * 0 if nothing was copied. + * - number of instruction bytes copied. + * - 0 if nothing was copied. + * - -EINVAL if the linear address of the instruction could not be calculated */ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { @@ -1458,7 +1458,7 @@ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) int not_copied; if (insn_get_effective_ip(regs, &ip)) - return 0; + return -EINVAL; not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE); @@ -1476,9 +1476,9 @@ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) * * Returns: * - * Number of instruction bytes copied. - * - * 0 if nothing was copied. + * - number of instruction bytes copied. + * - 0 if nothing was copied. + * - -EINVAL if the linear address of the instruction could not be calculated. */ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE]) { @@ -1486,7 +1486,7 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_IN int not_copied; if (insn_get_effective_ip(regs, &ip)) - return 0; + return -EINVAL; not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE); -- cgit v1.2.3 From 07570cef5e5c3fcec40f82a9075abb4c1da63319 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Jun 2021 15:53:27 +0200 Subject: x86/sev: Propagate #GP if getting linear instruction address failed When an instruction is fetched from user-space, segmentation needs to be taken into account. This means that getting the linear address of an instruction can fail. Hardware would raise a #GP exception in that case, but the #VC exception handler would emulate it as a page-fault. The insn_fetch_from_user*() functions now provide the relevant information in case of a failure. Use that and propagate a #GP when the linear address of an instruction to fetch could not be calculated. Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210614135327.9921-7-joro@8bytes.org --- arch/x86/kernel/sev.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index a1eeaa79bf2b..8178db07a06a 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -261,11 +261,18 @@ static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) int insn_bytes; insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (insn_bytes <= 0) { + if (insn_bytes == 0) { + /* Nothing could be copied */ ctxt->fi.vector = X86_TRAP_PF; ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; ctxt->fi.cr2 = ctxt->regs->ip; return ES_EXCEPTION; + } else if (insn_bytes == -EINVAL) { + /* Effective RIP could not be calculated */ + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + ctxt->fi.cr2 = 0; + return ES_EXCEPTION; } if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) -- cgit v1.2.3 From 1348924ba8169f35cedfd0a0087872b81a632b8e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 14 Jun 2021 14:12:22 -0700 Subject: x86/msr: Define new bits in TSX_FORCE_ABORT MSR Intel client processors that support the IA32_TSX_FORCE_ABORT MSR related to perf counter interaction [1] received a microcode update that deprecates the Transactional Synchronization Extension (TSX) feature. The bit FORCE_ABORT_RTM now defaults to 1, writes to this bit are ignored. A new bit TSX_CPUID_CLEAR clears the TSX related CPUID bits. The summary of changes to the IA32_TSX_FORCE_ABORT MSR are: Bit 0: FORCE_ABORT_RTM (legacy bit, new default=1) Status bit that indicates if RTM transactions are always aborted. This bit is essentially !SDV_ENABLE_RTM(Bit 2). Writes to this bit are ignored. Bit 1: TSX_CPUID_CLEAR (new bit, default=0) When set, CPUID.HLE = 0 and CPUID.RTM = 0. Bit 2: SDV_ENABLE_RTM (new bit, default=0) When clear, XBEGIN will always abort with EAX code 0. When set, XBEGIN will not be forced to abort (but will always abort in SGX enclaves). This bit is intended to be used on developer systems. If this bit is set, transactional atomicity correctness is not certain. SDV = Software Development Vehicle (SDV), i.e. developer systems. Performance monitoring counter 3 is usable in all cases, regardless of the value of above bits. Add support for a new CPUID bit - CPUID.RTM_ALWAYS_ABORT (CPUID 7.EDX[11]) - to indicate the status of always abort behavior. [1] [ bp: Look for document ID 604224, "Performance Monitoring Impact of Intel Transactional Synchronization Extension Memory". Since there's no way for us to have stable links to documents... ] [ bp: Massage and extend commit message. ] Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Tested-by: Neelima Krishnan Link: https://lkml.kernel.org/r/9add61915b4a4eedad74fbd869107863a28b428e.1623704845.git-series.pawan.kumar.gupta@linux.intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 81269c73a0dc..d0ce5cfd3ac1 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -378,6 +378,7 @@ #define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ #define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ +#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ #define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 742d89a00721..2bc1600d3604 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -772,6 +772,10 @@ #define MSR_TFA_RTM_FORCE_ABORT_BIT 0 #define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT) +#define MSR_TFA_TSX_CPUID_CLEAR_BIT 1 +#define MSR_TFA_TSX_CPUID_CLEAR BIT_ULL(MSR_TFA_TSX_CPUID_CLEAR_BIT) +#define MSR_TFA_SDV_ENABLE_RTM_BIT 2 +#define MSR_TFA_SDV_ENABLE_RTM BIT_ULL(MSR_TFA_SDV_ENABLE_RTM_BIT) /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 -- cgit v1.2.3 From ad3c2e174938d72fded674acead42e2464a3b460 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 14 Jun 2021 14:13:23 -0700 Subject: x86/events/intel: Do not deploy TSX force abort workaround when TSX is deprecated Earlier workaround added by 400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort") for perf counter interactions [1] are not required on some client systems which received a microcode update that deprecates TSX. Bypass the perf workaround when such microcode is enumerated. [1] [ bp: Look for document ID 604224, "Performance Monitoring Impact of Intel Transactional Synchronization Extension Memory". Since there's no way for us to have stable links to documents... ] [ bp: Massage comment. ] Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Tested-by: Neelima Krishnan Link: https://lkml.kernel.org/r/e4d410f786946280ced02dd07c74e0a74f1d10cb.1623704845.git-series.pawan.kumar.gupta@linux.intel.com --- arch/x86/events/intel/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2521d03de5e0..062bf8968c0e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6015,7 +6015,13 @@ __init int intel_pmu_init(void) tsx_attr = hsw_tsx_events_attrs; intel_pmu_pebs_data_source_skl(pmem); - if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + /* + * Processors with CPUID.RTM_ALWAYS_ABORT have TSX deprecated by default. + * TSX force abort hooks are not required on these systems. Only deploy + * workaround when microcode has not enabled X86_FEATURE_RTM_ALWAYS_ABORT. + */ + if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) && + !boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) { x86_pmu.flags |= PMU_FL_TFA; x86_pmu.get_event_constraints = tfa_get_event_constraints; x86_pmu.enable_all = intel_tfa_pmu_enable_all; -- cgit v1.2.3 From 293649307ef9abcd4f83f6dac4d4400dfd97c936 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 14 Jun 2021 14:14:25 -0700 Subject: x86/tsx: Clear CPUID bits when TSX always force aborts As a result of TSX deprecation, some processors always abort TSX transactions by default after a microcode update. When TSX feature cannot be used it is better to hide it. Clear CPUID.RTM and CPUID.HLE bits when TSX transactions always abort. [ bp: Massage commit message and comments. ] Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Tested-by: Neelima Krishnan Link: https://lkml.kernel.org/r/5209b3d72ffe5bd3cafdcc803f5b883f785329c3.1623704845.git-series.pawan.kumar.gupta@linux.intel.com --- arch/x86/kernel/cpu/cpu.h | 2 ++ arch/x86/kernel/cpu/intel.c | 4 +++- arch/x86/kernel/cpu/tsx.c | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 40 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 67944128876d..95521302630d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -48,6 +48,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], enum tsx_ctrl_states { TSX_CTRL_ENABLE, TSX_CTRL_DISABLE, + TSX_CTRL_RTM_ALWAYS_ABORT, TSX_CTRL_NOT_SUPPORTED, }; @@ -56,6 +57,7 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); extern void tsx_enable(void); extern void tsx_disable(void); +extern void tsx_clear_cpuid(void); #else static inline void tsx_init(void) { } #endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8adffc17fa8b..861e919eba9a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -717,8 +717,10 @@ static void init_intel(struct cpuinfo_x86 *c) if (tsx_ctrl_state == TSX_CTRL_ENABLE) tsx_enable(); - if (tsx_ctrl_state == TSX_CTRL_DISABLE) + else if (tsx_ctrl_state == TSX_CTRL_DISABLE) tsx_disable(); + else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT) + tsx_clear_cpuid(); split_lock_init(); bus_lock_init(); diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index e2ad30e474f8..9c7a5f049292 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -2,7 +2,7 @@ /* * Intel Transactional Synchronization Extensions (TSX) control. * - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2021 Intel Corporation * * Author: * Pawan Gupta @@ -84,13 +84,46 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) return TSX_CTRL_ENABLE; } +void tsx_clear_cpuid(void) +{ + u64 msr; + + /* + * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID + * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present. + */ + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && + boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + rdmsrl(MSR_TSX_FORCE_ABORT, msr); + msr |= MSR_TFA_TSX_CPUID_CLEAR; + wrmsrl(MSR_TSX_FORCE_ABORT, msr); + } +} + void __init tsx_init(void) { char arg[5] = {}; int ret; - if (!tsx_ctrl_is_supported()) + /* + * Hardware will always abort a TSX transaction if both CPUID bits + * RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are set. In this case, it is + * better not to enumerate CPUID.RTM and CPUID.HLE bits. Clear them + * here. + */ + if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && + boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT; + tsx_clear_cpuid(); + setup_clear_cpu_cap(X86_FEATURE_RTM); + setup_clear_cpu_cap(X86_FEATURE_HLE); return; + } + + if (!tsx_ctrl_is_supported()) { + tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED; + return; + } ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); if (ret >= 0) { -- cgit v1.2.3 From 4692bc775d2180a937335ccba0edce557103d44a Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jun 2021 22:16:39 +1200 Subject: x86/sgx: Add missing xa_destroy() when virtual EPC is destroyed xa_destroy() needs to be called to destroy a virtual EPC's page array before calling kfree() to free the virtual EPC. Currently it is not called so add the missing xa_destroy(). Fixes: 540745ddbc70 ("x86/sgx: Introduce virtual EPC for use by KVM guests") Signed-off-by: Kai Huang Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Tested-by: Yang Zhong Link: https://lkml.kernel.org/r/20210615101639.291929-1-kai.huang@intel.com --- arch/x86/kernel/cpu/sgx/virt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 6ad165a5c0cc..64511c4a5200 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -212,6 +212,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) list_splice_tail(&secs_pages, &zombie_secs_pages); mutex_unlock(&zombie_secs_pages_lock); + xa_destroy(&vepc->page_array); kfree(vepc); return 0; -- cgit v1.2.3 From a9d6496d667fdb86713868a402378a0e4db62b50 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 27 May 2021 15:57:51 +0800 Subject: KVM: x86/mmu: Make is_nx_huge_page_enabled an inline function Function 'is_nx_huge_page_enabled' is called only by kvm/mmu, so make it as inline fucntion and remove the unnecessary declaration. Cc: Ben Gardon Cc: Paolo Bonzini Cc: Sean Christopherson Suggested-by: Sean Christopherson Signed-off-by: Shaokun Zhang Message-Id: <1622102271-63107-1-git-send-email-zhangshaokun@hisilicon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 7 +------ arch/x86/kvm/mmu/mmu_internal.h | 9 ++++++--- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8d5876dfc6b7..8ac1b9c935fe 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -55,7 +55,7 @@ extern bool itlb_multihit_kvm_mitigation; -static int __read_mostly nx_huge_pages = -1; +int __read_mostly nx_huge_pages = -1; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; @@ -208,11 +208,6 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, kvm_flush_remote_tlbs_with_range(kvm, &range); } -bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index d64ccb417c60..ff4c6256f3f9 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -116,7 +116,12 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) kvm_x86_ops.cpu_dirty_log_size; } -bool is_nx_huge_page_enabled(void); +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} + bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); @@ -158,8 +163,6 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, kvm_pfn_t *pfnp, int *goal_levelp); -bool is_nx_huge_page_enabled(void); - void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp); -- cgit v1.2.3 From 43e5146436099a98fcd30793598d61e582ec6830 Mon Sep 17 00:00:00 2001 From: Siddharth Chandrasekaran Date: Wed, 26 May 2021 10:56:08 +0200 Subject: KVM: x86: Move FPU register accessors into fpu.h Hyper-v XMM fast hypercalls use XMM registers to pass input/output parameters. To access these, hyperv.c can reuse some FPU register accessors defined in emulator.c. Move them to a common location so both can access them. While at it, reorder the parameters of these accessor methods to make them more readable. Cc: Alexander Graf Cc: Evgeny Iakovlev Signed-off-by: Siddharth Chandrasekaran Message-Id: <01a85a6560714d4d3637d3d86e5eba65073318fa.1622019133.git.sidcha@amazon.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 137 ++++++-------------------------------------- arch/x86/kvm/fpu.h | 140 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/kvm_emulate.h | 3 +- 3 files changed, 158 insertions(+), 122 deletions(-) create mode 100644 arch/x86/kvm/fpu.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5e5de05a8fbf..10e16a70b361 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -22,7 +22,6 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" #include -#include #include #include @@ -1081,116 +1080,14 @@ static void fetch_register_operand(struct operand *op) } } -static void emulator_get_fpu(void) -{ - fpregs_lock(); - - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); -} - -static void emulator_put_fpu(void) -{ - fpregs_unlock(); -} - -static void read_sse_reg(sse128_t *data, int reg) -{ - emulator_get_fpu(); - switch (reg) { - case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; - case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; - case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; - case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; - case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; - case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; - case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; - case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; -#ifdef CONFIG_X86_64 - case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; - case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; - case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; - case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; - case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; - case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; - case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; - case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; -#endif - default: BUG(); - } - emulator_put_fpu(); -} - -static void write_sse_reg(sse128_t *data, int reg) -{ - emulator_get_fpu(); - switch (reg) { - case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; - case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; - case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; - case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; - case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; - case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; - case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; - case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; -#ifdef CONFIG_X86_64 - case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; - case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; - case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; - case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; - case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; - case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; - case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; - case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; -#endif - default: BUG(); - } - emulator_put_fpu(); -} - -static void read_mmx_reg(u64 *data, int reg) -{ - emulator_get_fpu(); - switch (reg) { - case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; - case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; - case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; - case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; - case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; - case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; - case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; - case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; - default: BUG(); - } - emulator_put_fpu(); -} - -static void write_mmx_reg(u64 *data, int reg) -{ - emulator_get_fpu(); - switch (reg) { - case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; - case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; - case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; - case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; - case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; - case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; - case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; - case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; - default: BUG(); - } - emulator_put_fpu(); -} - static int em_fninit(struct x86_emulate_ctxt *ctxt) { if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - emulator_get_fpu(); + kvm_fpu_get(); asm volatile("fninit"); - emulator_put_fpu(); + kvm_fpu_put(); return X86EMUL_CONTINUE; } @@ -1201,9 +1098,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - emulator_get_fpu(); + kvm_fpu_get(); asm volatile("fnstcw %0": "+m"(fcw)); - emulator_put_fpu(); + kvm_fpu_put(); ctxt->dst.val = fcw; @@ -1217,9 +1114,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) return emulate_nm(ctxt); - emulator_get_fpu(); + kvm_fpu_get(); asm volatile("fnstsw %0": "+m"(fsw)); - emulator_put_fpu(); + kvm_fpu_put(); ctxt->dst.val = fsw; @@ -1238,7 +1135,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; - read_sse_reg(&op->vec_val, reg); + kvm_read_sse_reg(reg, &op->vec_val); return; } if (ctxt->d & Mmx) { @@ -1289,7 +1186,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->type = OP_XMM; op->bytes = 16; op->addr.xmm = ctxt->modrm_rm; - read_sse_reg(&op->vec_val, ctxt->modrm_rm); + kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val); return rc; } if (ctxt->d & Mmx) { @@ -1866,10 +1763,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->bytes * op->count); break; case OP_XMM: - write_sse_reg(&op->vec_val, op->addr.xmm); + kvm_write_sse_reg(op->addr.xmm, &op->vec_val); break; case OP_MM: - write_mmx_reg(&op->mm_val, op->addr.mm); + kvm_write_mmx_reg(op->addr.mm, &op->mm_val); break; case OP_NONE: /* no writeback */ @@ -4124,11 +4021,11 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - emulator_get_fpu(); + kvm_fpu_get(); rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); - emulator_put_fpu(); + kvm_fpu_put(); if (rc != X86EMUL_CONTINUE) return rc; @@ -4172,7 +4069,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - emulator_get_fpu(); + kvm_fpu_get(); if (size < __fxstate_size(16)) { rc = fxregs_fixup(&fx_state, size); @@ -4189,7 +4086,7 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt) rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); out: - emulator_put_fpu(); + kvm_fpu_put(); return rc; } @@ -5437,9 +5334,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { int rc; - emulator_get_fpu(); + kvm_fpu_get(); rc = asm_safe("fwait"); - emulator_put_fpu(); + kvm_fpu_put(); if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); @@ -5450,7 +5347,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) static void fetch_possible_mmx_operand(struct operand *op) { if (op->type == OP_MM) - read_mmx_reg(&op->mm_val, op->addr.mm); + kvm_read_mmx_reg(op->addr.mm, &op->mm_val); } static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop) diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h new file mode 100644 index 000000000000..3ba12888bf66 --- /dev/null +++ b/arch/x86/kvm/fpu.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __KVM_FPU_H_ +#define __KVM_FPU_H_ + +#include + +typedef u32 __attribute__((vector_size(16))) sse128_t; +#define __sse128_u union { sse128_t vec; u64 as_u64[2]; u32 as_u32[4]; } +#define sse128_lo(x) ({ __sse128_u t; t.vec = x; t.as_u64[0]; }) +#define sse128_hi(x) ({ __sse128_u t; t.vec = x; t.as_u64[1]; }) +#define sse128_l0(x) ({ __sse128_u t; t.vec = x; t.as_u32[0]; }) +#define sse128_l1(x) ({ __sse128_u t; t.vec = x; t.as_u32[1]; }) +#define sse128_l2(x) ({ __sse128_u t; t.vec = x; t.as_u32[2]; }) +#define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; }) +#define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; }) + +static inline void _kvm_read_sse_reg(int reg, sse128_t *data) +{ + switch (reg) { + case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqa %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqa %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqa %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqa %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqa %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqa %%xmm7, %0" : "=m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqa %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqa %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqa %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqa %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqa %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqa %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqa %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqa %%xmm15, %0" : "=m"(*data)); break; +#endif + default: BUG(); + } +} + +static inline void _kvm_write_sse_reg(int reg, const sse128_t *data) +{ + switch (reg) { + case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqa %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqa %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqa %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqa %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqa %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqa %0, %%xmm7" : : "m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqa %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqa %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqa %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqa %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqa %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqa %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqa %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqa %0, %%xmm15" : : "m"(*data)); break; +#endif + default: BUG(); + } +} + +static inline void _kvm_read_mmx_reg(int reg, u64 *data) +{ + switch (reg) { + case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; + case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; + case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; + case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; + case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; + case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; + case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; + case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + default: BUG(); + } +} + +static inline void _kvm_write_mmx_reg(int reg, const u64 *data) +{ + switch (reg) { + case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; + case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; + case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; + case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; + case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; + case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; + case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; + case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + default: BUG(); + } +} + +static inline void kvm_fpu_get(void) +{ + fpregs_lock(); + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); +} + +static inline void kvm_fpu_put(void) +{ + fpregs_unlock(); +} + +static inline void kvm_read_sse_reg(int reg, sse128_t *data) +{ + kvm_fpu_get(); + _kvm_read_sse_reg(reg, data); + kvm_fpu_put(); +} + +static inline void kvm_write_sse_reg(int reg, const sse128_t *data) +{ + kvm_fpu_get(); + _kvm_write_sse_reg(reg, data); + kvm_fpu_put(); +} + +static inline void kvm_read_mmx_reg(int reg, u64 *data) +{ + kvm_fpu_get(); + _kvm_read_mmx_reg(reg, data); + kvm_fpu_put(); +} + +static inline void kvm_write_mmx_reg(int reg, const u64 *data) +{ + kvm_fpu_get(); + _kvm_write_mmx_reg(reg, data); + kvm_fpu_put(); +} + +#endif diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 3e870bf9ca4d..b063d376b7d9 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -13,6 +13,7 @@ #define _ASM_X86_KVM_X86_EMULATE_H #include +#include "fpu.h" struct x86_emulate_ctxt; enum x86_intercept; @@ -236,8 +237,6 @@ struct x86_emulate_ops { int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; -typedef u32 __attribute__((vector_size(16))) sse128_t; - /* Type, address-of, and value of an instruction's operand. */ struct operand { enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; -- cgit v1.2.3 From bd38b32053eb1c53ddb7030cf0fc6d700f7f1d82 Mon Sep 17 00:00:00 2001 From: Siddharth Chandrasekaran Date: Wed, 26 May 2021 10:56:09 +0200 Subject: KVM: hyper-v: Collect hypercall params into struct As of now there are 7 parameters (and flags) that are used in various hyper-v hypercall handlers. There are 6 more input/output parameters passed from XMM registers which are to be added in an upcoming patch. To make passing arguments to the handlers more readable, capture all these parameters into a single structure. Cc: Alexander Graf Cc: Evgeny Iakovlev Signed-off-by: Siddharth Chandrasekaran Message-Id: <273f7ed510a1f6ba177e61b73a5c7bfbee4a4a87.1622019133.git.sidcha@amazon.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 147 +++++++++++++++++++++++++++----------------------- 1 file changed, 79 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f00830e5202f..7c7a2da591da 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1631,7 +1631,18 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask( return vcpu_bitmap; } -static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex) +struct kvm_hv_hcall { + u64 param; + u64 ingpa; + u64 outgpa; + u16 code; + u16 rep_cnt; + u16 rep_idx; + bool fast; + bool rep; +}; + +static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); @@ -1646,7 +1657,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool bool all_cpus; if (!ex) { - if (unlikely(kvm_read_guest(kvm, ingpa, &flush, sizeof(flush)))) + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; trace_kvm_hv_flush_tlb(flush.processor_mask, @@ -1665,7 +1676,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || flush.processor_mask == 0; } else { - if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, sizeof(flush_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -1687,8 +1698,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool if (!all_cpus && kvm_read_guest(kvm, - ingpa + offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents), + hc->ingpa + offsetof(struct hv_tlb_flush_ex, + hv_vp_set.bank_contents), sparse_banks, sparse_banks_len)) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -1708,9 +1719,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool NULL, vcpu_mask, &hv_vcpu->tlb_flush); ret_success: - /* We always do full TLB flush, set rep_done = rep_cnt. */ + /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ return (u64)HV_STATUS_SUCCESS | - ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); + ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, @@ -1732,8 +1743,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, } } -static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa, - bool ex, bool fast) +static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) { struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; @@ -1748,25 +1758,25 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa, bool all_cpus; if (!ex) { - if (!fast) { - if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi, + if (!hc->fast) { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, sizeof(send_ipi)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = send_ipi.cpu_mask; vector = send_ipi.vector; } else { /* 'reserved' part of hv_send_ipi should be 0 */ - if (unlikely(ingpa >> 32 != 0)) + if (unlikely(hc->ingpa >> 32 != 0)) return HV_STATUS_INVALID_HYPERCALL_INPUT; - sparse_banks[0] = outgpa; - vector = (u32)ingpa; + sparse_banks[0] = hc->outgpa; + vector = (u32)hc->ingpa; } all_cpus = false; valid_bank_mask = BIT_ULL(0); trace_kvm_hv_send_ipi(vector, sparse_banks[0]); } else { - if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex, + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, sizeof(send_ipi_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -1786,8 +1796,8 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa, if (!all_cpus && kvm_read_guest(kvm, - ingpa + offsetof(struct hv_send_ipi_ex, - vp_set.bank_contents), + hc->ingpa + offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents), sparse_banks, sparse_banks_len)) return HV_STATUS_INVALID_HYPERCALL_INPUT; @@ -1847,20 +1857,21 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } -static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) +static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); struct eventfd_ctx *eventfd; - if (unlikely(!fast)) { + if (unlikely(!hc->fast)) { int ret; - gpa_t gpa = param; + gpa_t gpa = hc->ingpa; - if ((gpa & (__alignof__(param) - 1)) || - offset_in_page(gpa) + sizeof(param) > PAGE_SIZE) + if ((gpa & (__alignof__(hc->ingpa) - 1)) || + offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE) return HV_STATUS_INVALID_ALIGNMENT; - ret = kvm_vcpu_read_guest(vcpu, gpa, ¶m, sizeof(param)); + ret = kvm_vcpu_read_guest(vcpu, gpa, + &hc->ingpa, sizeof(hc->ingpa)); if (ret < 0) return HV_STATUS_INVALID_ALIGNMENT; } @@ -1870,15 +1881,15 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) * have no use for it, and in all known usecases it is zero, so just * report lookup failure if it isn't. */ - if (param & 0xffff00000000ULL) + if (hc->ingpa & 0xffff00000000ULL) return HV_STATUS_INVALID_PORT_ID; /* remaining bits are reserved-zero */ - if (param & ~KVM_HYPERV_CONN_ID_MASK) + if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK) return HV_STATUS_INVALID_HYPERCALL_INPUT; /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ rcu_read_lock(); - eventfd = idr_find(&hv->conn_to_evt, param); + eventfd = idr_find(&hv->conn_to_evt, hc->ingpa); rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; @@ -1889,9 +1900,8 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param) int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { - u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS; - uint16_t code, rep_idx, rep_cnt; - bool fast, rep; + struct kvm_hv_hcall hc; + u64 ret = HV_STATUS_SUCCESS; /* * hypercall generates UD from non zero cpl and real mode @@ -1904,104 +1914,105 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (is_64_bit_mode(vcpu)) { - param = kvm_rcx_read(vcpu); - ingpa = kvm_rdx_read(vcpu); - outgpa = kvm_r8_read(vcpu); + hc.param = kvm_rcx_read(vcpu); + hc.ingpa = kvm_rdx_read(vcpu); + hc.outgpa = kvm_r8_read(vcpu); } else #endif { - param = ((u64)kvm_rdx_read(vcpu) << 32) | - (kvm_rax_read(vcpu) & 0xffffffff); - ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | - (kvm_rcx_read(vcpu) & 0xffffffff); - outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | - (kvm_rsi_read(vcpu) & 0xffffffff); + hc.param = ((u64)kvm_rdx_read(vcpu) << 32) | + (kvm_rax_read(vcpu) & 0xffffffff); + hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | + (kvm_rcx_read(vcpu) & 0xffffffff); + hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | + (kvm_rsi_read(vcpu) & 0xffffffff); } - code = param & 0xffff; - fast = !!(param & HV_HYPERCALL_FAST_BIT); - rep_cnt = (param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; - rep_idx = (param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; - rep = !!(rep_cnt || rep_idx); + hc.code = hc.param & 0xffff; + hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); + hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; + hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; + hc.rep = !!(hc.rep_cnt || hc.rep_idx); - trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, + hc.ingpa, hc.outgpa); - switch (code) { + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - if (unlikely(rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: - if (unlikely(rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hvcall_signal_event(vcpu, fast, ingpa); + ret = kvm_hvcall_signal_event(vcpu, &hc); if (ret != HV_STATUS_INVALID_PORT_ID) break; fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ - if (unlikely(rep || !to_hv_synic(vcpu)->active)) { + if (unlikely(hc.rep || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = param; - vcpu->run->hyperv.u.hcall.params[0] = ingpa; - vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->run->hyperv.u.hcall.input = hc.param; + vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; + vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: - if (unlikely(fast || !rep_cnt || rep_idx)) { + if (unlikely(hc.fast || !hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false); + ret = kvm_hv_flush_tlb(vcpu, &hc, false); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: - if (unlikely(fast || rep)) { + if (unlikely(hc.fast || hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, false); + ret = kvm_hv_flush_tlb(vcpu, &hc, false); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: - if (unlikely(fast || !rep_cnt || rep_idx)) { + if (unlikely(hc.fast || !hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); + ret = kvm_hv_flush_tlb(vcpu, &hc, true); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: - if (unlikely(fast || rep)) { + if (unlikely(hc.fast || hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); + ret = kvm_hv_flush_tlb(vcpu, &hc, true); break; case HVCALL_SEND_IPI: - if (unlikely(rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast); + ret = kvm_hv_send_ipi(vcpu, &hc, false); break; case HVCALL_SEND_IPI_EX: - if (unlikely(fast || rep)) { + if (unlikely(hc.fast || hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } - ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false); + ret = kvm_hv_send_ipi(vcpu, &hc, true); break; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: - if (unlikely(fast)) { + if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } @@ -2020,9 +2031,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; - vcpu->run->hyperv.u.hcall.input = param; - vcpu->run->hyperv.u.hcall.params[0] = ingpa; - vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->run->hyperv.u.hcall.input = hc.param; + vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; + vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; -- cgit v1.2.3 From 5974565bc26d6a599189db7c0b1f79eaa9af8eb9 Mon Sep 17 00:00:00 2001 From: Siddharth Chandrasekaran Date: Wed, 26 May 2021 10:56:10 +0200 Subject: KVM: x86: kvm_hv_flush_tlb use inputs from XMM registers Hyper-V supports the use of XMM registers to perform fast hypercalls. This allows guests to take advantage of the improved performance of the fast hypercall interface even though a hypercall may require more than (the current maximum of) two input registers. The XMM fast hypercall interface uses six additional XMM registers (XMM0 to XMM5) to allow the guest to pass an input parameter block of up to 112 bytes. Add framework to read from XMM registers in kvm_hv_hypercall() and use the additional hypercall inputs from XMM registers in kvm_hv_flush_tlb() when possible. Cc: Alexander Graf Co-developed-by: Evgeny Iakovlev Signed-off-by: Evgeny Iakovlev Signed-off-by: Siddharth Chandrasekaran Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 3 ++ arch/x86/kvm/hyperv.c | 90 ++++++++++++++++++++++++++++++-------- 2 files changed, 74 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 606f5cc579b2..27a9f08e8386 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -314,6 +314,9 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 +/* Number of XMM registers used in hypercall input/output */ +#define HV_HYPERCALL_MAX_XMM_REGISTERS 6 + struct hv_nested_enlightenments_control { struct { __u32 directhypercall:1; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7c7a2da591da..449589e283d6 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -36,6 +36,7 @@ #include "trace.h" #include "irq.h" +#include "fpu.h" /* "Hv#1" signature */ #define HYPERV_CPUID_SIGNATURE_EAX 0x31237648 @@ -1640,10 +1641,13 @@ struct kvm_hv_hcall { u16 rep_idx; bool fast; bool rep; + sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; }; static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool ex) { + int i; + gpa_t gpa; struct kvm *kvm = vcpu->kvm; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct hv_tlb_flush_ex flush_ex; @@ -1657,8 +1661,15 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool bool all_cpus; if (!ex) { - if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->fast) { + flush.address_space = hc->ingpa; + flush.flags = hc->outgpa; + flush.processor_mask = sse128_lo(hc->xmm[0]); + } else { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, + &flush, sizeof(flush)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } trace_kvm_hv_flush_tlb(flush.processor_mask, flush.address_space, flush.flags); @@ -1676,9 +1687,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || flush.processor_mask == 0; } else { - if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, - sizeof(flush_ex)))) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->fast) { + flush_ex.address_space = hc->ingpa; + flush_ex.flags = hc->outgpa; + memcpy(&flush_ex.hv_vp_set, + &hc->xmm[0], sizeof(hc->xmm[0])); + } else { + if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, + sizeof(flush_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, @@ -1689,20 +1707,28 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = - bitmap_weight((unsigned long *)&valid_bank_mask, 64) * - sizeof(sparse_banks[0]); + sparse_banks_len = bitmap_weight((unsigned long *)&valid_bank_mask, 64); if (!sparse_banks_len && !all_cpus) goto ret_success; - if (!all_cpus && - kvm_read_guest(kvm, - hc->ingpa + offsetof(struct hv_tlb_flush_ex, - hv_vp_set.bank_contents), - sparse_banks, - sparse_banks_len)) - return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (!all_cpus) { + if (hc->fast) { + if (sparse_banks_len > HV_HYPERCALL_MAX_XMM_REGISTERS - 1) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + for (i = 0; i < sparse_banks_len; i += 2) { + sparse_banks[i] = sse128_lo(hc->xmm[i / 2 + 1]); + sparse_banks[i + 1] = sse128_hi(hc->xmm[i / 2 + 1]); + } + } else { + gpa = hc->ingpa + offsetof(struct hv_tlb_flush_ex, + hv_vp_set.bank_contents); + if (unlikely(kvm_read_guest(kvm, gpa, sparse_banks, + sparse_banks_len * + sizeof(sparse_banks[0])))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + } } cpumask_clear(&hv_vcpu->tlb_flush); @@ -1898,6 +1924,29 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h return HV_STATUS_SUCCESS; } +static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) +{ + switch (hc->code) { + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + return true; + } + + return false; +} + +static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) +{ + int reg; + + kvm_fpu_get(); + for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) + _kvm_read_sse_reg(reg, &hc->xmm[reg]); + kvm_fpu_put(); +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { struct kvm_hv_hcall hc; @@ -1934,6 +1983,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); + if (hc.fast && is_xmm_fast_hypercall(&hc)) + kvm_hv_hypercall_read_xmm(&hc); + trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); @@ -1969,28 +2021,28 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_complete_userspace; return 0; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: - if (unlikely(hc.fast || !hc.rep_cnt || hc.rep_idx)) { + if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc, false); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: - if (unlikely(hc.fast || hc.rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc, false); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: - if (unlikely(hc.fast || !hc.rep_cnt || hc.rep_idx)) { + if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc, true); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: - if (unlikely(hc.fast || hc.rep)) { + if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } -- cgit v1.2.3 From d8f5537a8816c8f00ea3103e74b65987963a56c6 Mon Sep 17 00:00:00 2001 From: Siddharth Chandrasekaran Date: Wed, 26 May 2021 11:03:56 +0200 Subject: KVM: hyper-v: Advertise support for fast XMM hypercalls Now that kvm_hv_flush_tlb() has been patched to support XMM hypercall inputs, we can start advertising this feature to guests. Cc: Alexander Graf Cc: Evgeny Iakovlev Signed-off-by: Siddharth Chandrasekaran Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 7 ++++++- arch/x86/kvm/hyperv.c | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 27a9f08e8386..9fe4cc9c0f7d 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -52,7 +52,7 @@ * Support for passing hypercall input parameter block via XMM * registers is available */ -#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE BIT(4) +#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4) /* Support for a virtual guest idle state is available */ #define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) /* Frequency MSRs available */ @@ -61,6 +61,11 @@ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) /* Support for debug MSRs available */ #define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) +/* + * Support for returning hypercall output block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE BIT(15) /* stimer Direct Mode is available */ #define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 449589e283d6..dbd3152b1379 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2243,6 +2243,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; + ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; -- cgit v1.2.3 From 3ad93562093d764bc22d6460e84ba60d0c57f7ab Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Thu, 29 Apr 2021 11:41:14 +0800 Subject: KVM: x86: Support write protecting only large pages Prepare for write protecting large page lazily during dirty log tracking, for which we will only need to write protect gfns at large page granularity. No functional or performance change expected. Signed-off-by: Keqian Zhu Message-Id: <20210429034115.35560-2-zhukeqian1@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 +++++---- arch/x86/kvm/mmu/mmu_internal.h | 3 ++- arch/x86/kvm/mmu/page_track.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 16 ++++++++++++---- arch/x86/kvm/mmu/tdp_mmu.h | 3 ++- 5 files changed, 22 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8ac1b9c935fe..a668d2050b79 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1249,20 +1249,21 @@ int kvm_cpu_dirty_log_size(void) } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn) + struct kvm_memory_slot *slot, u64 gfn, + int min_level) { struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; - for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = __gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } if (is_tdp_mmu_enabled(kvm)) write_protected |= - kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn); + kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); return write_protected; } @@ -1272,7 +1273,7 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); + return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ff4c6256f3f9..18be103df9d5 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -128,7 +128,8 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn); + struct kvm_memory_slot *slot, u64 gfn, + int min_level); void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 34bb0ec69bd8..91a9f7e0fd91 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -100,7 +100,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, kvm_mmu_gfn_disallow_lpage(slot, gfn); if (mode == KVM_PAGE_TRACK_WRITE) - if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) kvm_flush_remote_tlbs(kvm); } EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 237317b1eddd..6b6dfcdcb179 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1462,15 +1462,22 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, * Returns true if an SPTE was set and a TLB flush is needed. */ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t gfn) + gfn_t gfn, int min_level) { struct tdp_iter iter; u64 new_spte; bool spte_set = false; + BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + rcu_read_lock(); - tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) { + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, gfn, gfn + 1) { + if (!is_shadow_present_pte(iter.old_spte) || + !is_last_spte(iter.old_spte, iter.level)) + continue; + if (!is_writable_pte(iter.old_spte)) break; @@ -1492,14 +1499,15 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, * Returns true if an SPTE was set and a TLB flush is needed. */ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn) + struct kvm_memory_slot *slot, gfn_t gfn, + int min_level) { struct kvm_mmu_page *root; bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); for_each_tdp_mmu_root(kvm, root, slot->as_id) - spte_set |= write_protect_gfn(kvm, root, gfn); + spte_set |= write_protect_gfn(kvm, root, gfn, min_level); return spte_set; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 5fdf63090451..a861570fcd7c 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -74,7 +74,8 @@ bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, bool flush); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn); + struct kvm_memory_slot *slot, gfn_t gfn, + int min_level); int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); -- cgit v1.2.3 From 8921291980db8184cdeb95987281c663f844b22c Mon Sep 17 00:00:00 2001 From: Keqian Zhu Date: Thu, 29 Apr 2021 11:41:15 +0800 Subject: KVM: x86: Do not write protect huge page in initially-all-set mode Currently, when dirty logging is started in initially-all-set mode, we write protect huge pages to prepare for splitting them into 4K pages, and leave normal pages untouched as the logging will be enabled lazily as dirty bits are cleared. However, enabling dirty logging lazily is also feasible for huge pages. This not only reduces the time of start dirty logging, but it also greatly reduces side-effect on guest when there is high dirty rate. Signed-off-by: Keqian Zhu Message-Id: <20210429034115.35560-3-zhukeqian1@huawei.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 30 ++++++++++++++++++++++++++---- arch/x86/kvm/x86.c | 37 ++++++++++--------------------------- 2 files changed, 36 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a668d2050b79..66e4d096fe05 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1172,8 +1172,7 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, * @gfn_offset: start of the BITS_PER_LONG pages we care about * @mask: indicates which pages we should protect * - * Used when we do not need to care about huge page mappings: e.g. during dirty - * logging we do not have any such mappings. + * Used when we do not need to care about huge page mappings. */ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -1230,13 +1229,36 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to * enable dirty logging for them. * - * Used when we do not need to care about huge page mappings: e.g. during dirty - * logging we do not have any such mappings. + * We need to care about huge page mappings: e.g. during dirty logging we may + * have such mappings. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { + /* + * Huge pages are NOT write protected when we start dirty logging in + * initially-all-set mode; must write protect them here so that they + * are split to 4K on the first write. + * + * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn + * of memslot has no such restriction, so the range can cross two large + * pages. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { + gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); + gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); + + kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); + + /* Cross two large pages? */ + if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != + ALIGN(end << PAGE_SHIFT, PMD_SIZE)) + kvm_mmu_slot_gfn_write_protect(kvm, slot, end, + PG_LEVEL_2M); + } + + /* Now handle 4K PTEs. */ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d425310054b..4ae708eb35f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11103,36 +11103,19 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, */ kvm_mmu_zap_collapsible_sptes(kvm, new); } else { - /* By default, write-protect everything to log writes. */ - int level = PG_LEVEL_4K; + /* + * Initially-all-set does not require write protecting any page, + * because they're all assumed to be dirty. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + return; if (kvm_x86_ops.cpu_dirty_log_size) { - /* - * Clear all dirty bits, unless pages are treated as - * dirty from the get-go. - */ - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) - kvm_mmu_slot_leaf_clear_dirty(kvm, new); - - /* - * Write-protect large pages on write so that dirty - * logging happens at 4k granularity. No need to - * write-protect small SPTEs since write accesses are - * logged by the CPU via dirty bits. - */ - level = PG_LEVEL_2M; - } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { - /* - * If we're with initial-all-set, we don't need - * to write protect any small page because - * they're reported as dirty already. However - * we still need to write-protect huge pages - * so that the page split can happen lazily on - * the first write to the huge page. - */ - level = PG_LEVEL_2M; + kvm_mmu_slot_leaf_clear_dirty(kvm, new); + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); + } else { + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); } - kvm_mmu_slot_remove_write_access(kvm, new, level); } } -- cgit v1.2.3 From c9b929b3fadc0504605d29016eb8274358c7d3ed Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:08 -0700 Subject: KVM: x86/mmu: Deduplicate rmap freeing Small code deduplication. No functional change expected. Reviewed-by: David Hildenbrand Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-2-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4ae708eb35f5..eaa01e6fe39b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10917,17 +10917,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +static void memslot_rmap_free(struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; + } +} - if (i == 0) - continue; +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + int i; + + memslot_rmap_free(slot); + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -10993,12 +10999,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, return 0; out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); - slot->arch.rmap[i] = NULL; - if (i == 0) - continue; + memslot_rmap_free(slot); + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } -- cgit v1.2.3 From 56dd1019c88510e79a820965a2da35907fbab00d Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:09 -0700 Subject: KVM: x86/mmu: Factor out allocating memslot rmap Small refactor to facilitate allocating rmaps for all memslots at once. No functional change expected. Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-3-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eaa01e6fe39b..5f66a5972d82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10941,10 +10941,31 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) kvm_page_track_free_memslot(slot); } +static int memslot_rmap_alloc(struct kvm_memory_slot *slot, + unsigned long npages) +{ + const int sz = sizeof(*slot->arch.rmap[0]); + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + int level = i + 1; + int lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); + if (!slot->arch.rmap[i]) { + memslot_rmap_free(slot); + return -ENOMEM; + } + } + + return 0; +} + static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, unsigned long npages) { - int i; + int i, r; /* * Clear out the previous array pointers for the KVM_MR_MOVE case. The @@ -10953,7 +10974,11 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, */ memset(&slot->arch, 0, sizeof(slot->arch)); - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + r = memslot_rmap_alloc(slot, npages); + if (r) + return r; + + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; @@ -10962,14 +10987,6 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; - slot->arch.rmap[i] = - kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL_ACCOUNT); - if (!slot->arch.rmap[i]) - goto out_free; - if (i == 0) - continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; -- cgit v1.2.3 From a255740876f006eb9041fadcc4750557d26add5f Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:12 -0700 Subject: KVM: x86/mmu: Add a field to control memslot rmap allocation Add a field to control whether new memslots should have rmaps allocated for them. As of this change, it's not safe to skip allocating rmaps, so the field is always set to allocate rmaps. Future changes will make it safe to operate without rmaps, using the TDP MMU. Then further changes will allow the rmaps to be allocated lazily when needed for nested oprtation. No functional change expected. Reviewed-by: David Hildenbrand Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-6-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/mmu/mmu.c | 2 ++ arch/x86/kvm/x86.c | 13 ++++++++----- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9c7ced0e3171..11798a9ff3e9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1124,6 +1124,12 @@ struct kvm_arch { */ spinlock_t tdp_mmu_pages_lock; #endif /* CONFIG_X86_64 */ + + /* + * If set, rmaps have been allocated for all memslots and should be + * allocated for any newly created or modified memslots. + */ + bool memslots_have_rmaps; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 66e4d096fe05..64b3ee7ea467 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5511,6 +5511,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) kvm_mmu_init_tdp_mmu(kvm); + kvm->arch.memslots_have_rmaps = true; + node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5f66a5972d82..11637fb10360 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10962,7 +10962,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, return 0; } -static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, +static int kvm_alloc_memslot_metadata(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages) { int i, r; @@ -10974,9 +10975,11 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, */ memset(&slot->arch, 0, sizeof(slot->arch)); - r = memslot_rmap_alloc(slot, npages); - if (r) - return r; + if (kvm->arch.memslots_have_rmaps) { + r = memslot_rmap_alloc(slot, npages); + if (r) + return r; + } for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; @@ -11047,7 +11050,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change) { if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) - return kvm_alloc_memslot_metadata(memslot, + return kvm_alloc_memslot_metadata(kvm, memslot, mem->memory_size >> PAGE_SHIFT); return 0; } -- cgit v1.2.3 From e2209710ccc5d28d8b88c822d2f3e03b269a2856 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:13 -0700 Subject: KVM: x86/mmu: Skip rmap operations if rmaps not allocated If only the TDP MMU is being used to manage the memory mappings for a VM, then many rmap operations can be skipped as they are guaranteed to be no-ops. This saves some time which would be spent on the rmap operation. It also avoids acquiring the MMU lock in write mode for many operations. This makes it safe to run the VM without rmaps allocated, when only using the TDP MMU and sets the stage for waiting to allocate the rmaps until they're needed. Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-7-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 5 +++ arch/x86/kvm/mmu/mmu.c | 113 +++++++++++++++++++++++++++++-------------------- arch/x86/kvm/x86.c | 2 +- 3 files changed, 72 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 88d0ed5225a4..af09c47b1aa2 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -232,4 +232,9 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); +static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +{ + return kvm->arch.memslots_have_rmaps; +} + #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 64b3ee7ea467..2131f71577bc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1183,6 +1183,10 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1212,6 +1216,10 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); + + if (!kvm_memslots_have_rmaps(kvm)) + return; + while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); @@ -1278,9 +1286,11 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, int i; bool write_protected = false; - for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmap_head, true); + if (kvm_memslots_have_rmaps(kvm)) { + for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + rmap_head = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); + } } if (is_tdp_mmu_enabled(kvm)) @@ -1451,9 +1461,10 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm, bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush; + bool flush = false; - flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); @@ -1463,9 +1474,10 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool flush; + bool flush = false; - flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp); if (is_tdp_mmu_enabled(kvm)) flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range); @@ -1518,9 +1530,10 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool young; + bool young = false; - young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); @@ -1530,9 +1543,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { - bool young; + bool young = false; - young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); + if (kvm_memslots_have_rmaps(kvm)) + young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp); if (is_tdp_mmu_enabled(kvm)) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); @@ -5534,29 +5548,29 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; - write_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; - flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, - start, end - 1, true, flush); + flush = slot_handle_level_range(kvm, memslot, + kvm_zap_rmapp, PG_LEVEL_4K, + KVM_MAX_HUGEPAGE_LEVEL, start, + end - 1, true, flush); + } } + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + write_unlock(&kvm->mmu_lock); } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - - write_unlock(&kvm->mmu_lock); - if (is_tdp_mmu_enabled(kvm)) { flush = false; @@ -5583,12 +5597,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot, int start_level) { - bool flush; + bool flush = false; - write_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, + false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); @@ -5658,16 +5675,15 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; bool flush; - write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + if (flush) + kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - read_lock(&kvm->mmu_lock); flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); if (flush) @@ -5694,11 +5710,14 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush; + bool flush = false; - write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); - write_unlock(&kvm->mmu_lock); + if (kvm_memslots_have_rmaps(kvm)) { + write_lock(&kvm->mmu_lock); + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, + false); + write_unlock(&kvm->mmu_lock); + } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 11637fb10360..ddeff81f90a4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10975,7 +10975,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, */ memset(&slot->arch, 0, sizeof(slot->arch)); - if (kvm->arch.memslots_have_rmaps) { + if (kvm_memslots_have_rmaps(kvm)) { r = memslot_rmap_alloc(slot, npages); if (r) return r; -- cgit v1.2.3 From d501f747ef5c0ac0c917f9a6781d04ae4ae39d63 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 18 May 2021 10:34:14 -0700 Subject: KVM: x86/mmu: Lazily allocate memslot rmaps If the TDP MMU is in use, wait to allocate the rmaps until the shadow MMU is actually used. (i.e. a nested VM is launched.) This saves memory equal to 0.2% of guest memory in cases where the TDP MMU is used and there are no nested guests involved. Signed-off-by: Ben Gardon Message-Id: <20210518173414.450044-8-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.h | 7 ++++++- arch/x86/kvm/mmu/mmu.c | 14 ++++++++++--- arch/x86/kvm/mmu/tdp_mmu.c | 6 ++++-- arch/x86/kvm/mmu/tdp_mmu.h | 4 ++-- arch/x86/kvm/x86.c | 46 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 71 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 11798a9ff3e9..dadb545c429f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1869,4 +1869,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) int kvm_cpu_dirty_log_size(void); +int alloc_all_memslots_rmaps(struct kvm *kvm); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index af09c47b1aa2..9d8550af994c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -234,7 +234,12 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm); static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { - return kvm->arch.memslots_have_rmaps; + /* + * Read memslot_have_rmaps before rmap pointers. Hence, threads reading + * memslots_have_rmaps in any lock context are guaranteed to see the + * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps. + */ + return smp_load_acquire(&kvm->arch.memslots_have_rmaps); } #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2131f71577bc..aa9e77f406d9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3312,6 +3312,10 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } } + r = alloc_all_memslots_rmaps(vcpu->kvm); + if (r) + return r; + write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) @@ -5523,9 +5527,13 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; - kvm_mmu_init_tdp_mmu(kvm); - - kvm->arch.memslots_have_rmaps = true; + if (!kvm_mmu_init_tdp_mmu(kvm)) + /* + * No smp_load/store wrappers needed here as we are in + * VM init and there cannot be any memslots / other threads + * accessing this struct kvm yet. + */ + kvm->arch.memslots_have_rmaps = true; node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6b6dfcdcb179..cc13e001f3de 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -14,10 +14,10 @@ static bool __read_mostly tdp_mmu_enabled = false; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ -void kvm_mmu_init_tdp_mmu(struct kvm *kvm) +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled)) - return; + return false; /* This should not be changed for the lifetime of the VM. */ kvm->arch.tdp_mmu_enabled = true; @@ -25,6 +25,8 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots); spin_lock_init(&kvm->arch.tdp_mmu_pages_lock); INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages); + + return true; } static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index a861570fcd7c..f7a7990da11d 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -81,12 +81,12 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); #ifdef CONFIG_X86_64 -void kvm_mmu_init_tdp_mmu(struct kvm *kvm); +bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } #else -static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {} +static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ddeff81f90a4..e838e999ab49 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10952,6 +10952,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, int lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; + WARN_ON(slot->arch.rmap[i]); + slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) { memslot_rmap_free(slot); @@ -10962,6 +10964,50 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, return 0; } +int alloc_all_memslots_rmaps(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r, i; + + /* + * Check if memslots alreday have rmaps early before acquiring + * the slots_arch_lock below. + */ + if (kvm_memslots_have_rmaps(kvm)) + return 0; + + mutex_lock(&kvm->slots_arch_lock); + + /* + * Read memslots_have_rmaps again, under the slots arch lock, + * before allocating the rmaps + */ + if (kvm_memslots_have_rmaps(kvm)) { + mutex_unlock(&kvm->slots_arch_lock); + return 0; + } + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + r = memslot_rmap_alloc(slot, slot->npages); + if (r) { + mutex_unlock(&kvm->slots_arch_lock); + return r; + } + } + } + + /* + * Ensure that memslots_have_rmaps becomes true strictly after + * all the rmap pointers are set. + */ + smp_store_release(&kvm->arch.memslots_have_rmaps, true); + mutex_unlock(&kvm->slots_arch_lock); + return 0; +} + static int kvm_alloc_memslot_metadata(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) -- cgit v1.2.3 From 805d705ff8f3a05e63ce350ac0c37a3290ed9bb7 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:09 +0100 Subject: KVM: X86: Store L1's TSC scaling ratio in 'struct kvm_vcpu_arch' Store L1's scaling ratio in the kvm_vcpu_arch struct like we already do for L1's TSC offset. This allows for easy save/restore when we enter and then exit the nested guest. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-3-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 6 ++++-- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dadb545c429f..8808c8ae9370 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -707,7 +707,7 @@ struct kvm_vcpu_arch { } st; u64 l1_tsc_offset; - u64 tsc_offset; + u64 tsc_offset; /* current tsc offset */ u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; @@ -721,7 +721,8 @@ struct kvm_vcpu_arch { u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; u64 msr_ia32_power_ctl; - u64 tsc_scaling_ratio; + u64 l1_tsc_scaling_ratio; + u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c2a779b688e6..d3201efa6a07 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7453,10 +7453,10 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, delta_tsc = 0; /* Convert to host delta tsc if tsc scaling is enabled */ - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && delta_tsc && u64_shl_div_u64(delta_tsc, kvm_tsc_scaling_ratio_frac_bits, - vcpu->arch.tsc_scaling_ratio, &delta_tsc)) + vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) return -ERANGE; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e838e999ab49..571ee7ef3e0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2185,6 +2185,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? */ if (!scale) { + vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return 0; } @@ -2211,7 +2212,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return -1; } - vcpu->arch.tsc_scaling_ratio = ratio; + vcpu->arch.l1_tsc_scaling_ratio = vcpu->arch.tsc_scaling_ratio = ratio; return 0; } @@ -2223,6 +2224,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ + vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return -1; } @@ -2459,7 +2461,7 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); adjust_tsc_offset_guest(vcpu, adjustment); -- cgit v1.2.3 From 9b399dfd4c60a2249f45f3938b1b9b49394dfe3a Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:10 +0100 Subject: KVM: X86: Rename kvm_compute_tsc_offset() to kvm_compute_l1_tsc_offset() All existing code uses kvm_compute_tsc_offset() passing L1 TSC values to it. Let's document this by renaming it to kvm_compute_l1_tsc_offset(). Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-4-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 571ee7ef3e0a..a1338bf871f7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2319,7 +2319,7 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) } EXPORT_SYMBOL_GPL(kvm_scale_tsc); -static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; @@ -2363,7 +2363,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -2402,7 +2402,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); @@ -3252,7 +3252,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, data); } else { - u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; } @@ -4140,7 +4140,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mark_tsc_unstable("KVM discovered backwards TSC"); if (kvm_check_tsc_unstable()) { - u64 offset = kvm_compute_tsc_offset(vcpu, + u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; -- cgit v1.2.3 From fe3eb50418174567f6fbfb3d90a95cbd7a0cc17b Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:11 +0100 Subject: KVM: X86: Add a ratio parameter to kvm_scale_tsc() Sometimes kvm_scale_tsc() needs to use the current scaling ratio and other times (like when reading the TSC from user space) it needs to use L1's scaling ratio. Have the caller specify this by passing the ratio as a parameter. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-5-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8808c8ae9370..d6bba19bc094 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1795,7 +1795,7 @@ static inline bool kvm_is_supported_user_return_msr(u32 msr) return kvm_find_user_return_msr(msr) >= 0; } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1338bf871f7..a6d46520b550 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2307,10 +2307,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc) return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio) { u64 _tsc = tsc; - u64 ratio = vcpu->arch.tsc_scaling_ratio; if (ratio != kvm_default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); @@ -2323,14 +2322,15 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc()); + tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + + kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); @@ -2463,7 +2463,8 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + adjustment = kvm_scale_tsc(vcpu, (u64) adjustment, + vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } @@ -2846,7 +2847,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz, + v->arch.l1_tsc_scaling_ratio); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, @@ -3554,10 +3556,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * return L1's TSC value to ensure backwards-compatible * behavior for migration. */ - u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : - vcpu->arch.tsc_offset; + u64 offset, ratio; - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; + if (msr_info->host_initiated) { + offset = vcpu->arch.l1_tsc_offset; + ratio = vcpu->arch.l1_tsc_scaling_ratio; + } else { + offset = vcpu->arch.tsc_offset; + ratio = vcpu->arch.tsc_scaling_ratio; + } + + msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset; break; } case MSR_MTRRcap: -- cgit v1.2.3 From 3c0f99366e34c1b45e4908e151089a8bf93fbe71 Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:12 +0100 Subject: KVM: nVMX: Add a TSC multiplier field in VMCS12 This is required for supporting nested TSC scaling. Signed-off-by: Ilias Stamatis Reviewed-by: Jim Mattson Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-6-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmcs12.c | 1 + arch/x86/kvm/vmx/vmcs12.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 034adb6404dc..d9f5d7c56ae3 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -37,6 +37,7 @@ const unsigned short vmcs_field_to_offset_table[] = { FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr), FIELD64(PML_ADDRESS, pml_address), FIELD64(TSC_OFFSET, tsc_offset), + FIELD64(TSC_MULTIPLIER, tsc_multiplier), FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr), FIELD64(APIC_ACCESS_ADDR, apic_access_addr), FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr), diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 13494956d0e9..bb81a23afe89 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -70,7 +70,8 @@ struct __packed vmcs12 { u64 eptp_list_address; u64 pml_address; u64 encls_exiting_bitmap; - u64 padding64[2]; /* room for future expansion */ + u64 tsc_multiplier; + u64 padding64[1]; /* room for future expansion */ /* * To allow migration of L1 (complete with its L2 guests) between * machines of different natural widths (32 or 64 bit), we cannot have @@ -258,6 +259,7 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(eptp_list_address, 304); CHECK_OFFSET(pml_address, 312); CHECK_OFFSET(encls_exiting_bitmap, 320); + CHECK_OFFSET(tsc_multiplier, 328); CHECK_OFFSET(cr0_guest_host_mask, 344); CHECK_OFFSET(cr4_guest_host_mask, 352); CHECK_OFFSET(cr0_read_shadow, 360); -- cgit v1.2.3 From 307a94c721fed1aaaeee68115df6f7fb8193b23f Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:13 +0100 Subject: KVM: X86: Add functions for retrieving L2 TSC fields from common code In order to implement as much of the nested TSC scaling logic as possible in common code, we need these vendor callbacks for retrieving the TSC offset and the TSC multiplier that L1 has set for L2. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-7-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 ++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm/svm.c | 14 ++++++++++++++ arch/x86/kvm/vmx/vmx.c | 23 +++++++++++++++++++++++ arch/x86/kvm/vmx/vmx.h | 3 +++ 5 files changed, 44 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index e7bef91cee04..c4906f73603d 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -87,6 +87,8 @@ KVM_X86_OP(set_identity_map_addr) KVM_X86_OP(get_mt_mask) KVM_X86_OP(load_mmu_pgd) KVM_X86_OP_NULL(has_wbinvd_exit) +KVM_X86_OP(get_l2_tsc_offset) +KVM_X86_OP(get_l2_tsc_multiplier) KVM_X86_OP(write_l1_tsc_offset) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d6bba19bc094..6ec00427c6fd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1311,6 +1311,8 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); + u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8c3918a11826..95ae2734760e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1080,6 +1080,18 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->nested.ctl.tsc_offset; +} + +static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + return kvm_default_tsc_scaling_ratio; +} + static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4524,6 +4536,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .has_wbinvd_exit = svm_has_wbinvd_exit, + .get_l2_tsc_offset = svm_get_l2_tsc_offset, + .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, .write_l1_tsc_offset = svm_write_l1_tsc_offset, .load_mmu_pgd = svm_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d3201efa6a07..2ce2c73645bf 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1787,6 +1787,27 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx->guest_uret_msrs_loaded = false; } +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) + return vmcs12->tsc_offset; + + return 0; +} + +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + return vmcs12->tsc_multiplier; + + return kvm_default_tsc_scaling_ratio; +} + static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -7700,6 +7721,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .get_l2_tsc_offset = vmx_get_l2_tsc_offset, + .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, .write_l1_tsc_offset = vmx_write_l1_tsc_offset, .load_mmu_pgd = vmx_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 16e4e457ba23..aa97c82e3451 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -404,6 +404,9 @@ void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type); +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); + static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool value) { -- cgit v1.2.3 From 83150f2932ec4712e2630009ac4a585d4aba7a9e Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:14 +0100 Subject: KVM: X86: Add functions that calculate the nested TSC fields When L2 is entered we need to "merge" the TSC multiplier and TSC offset values of 01 and 12 together. The merging is done using the following equations: offset_02 = ((offset_01 * mult_12) >> shift_bits) + offset_12 mult_02 = (mult_01 * mult_12) >> shift_bits Where shift_bits is kvm_tsc_scaling_ratio_frac_bits. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-8-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ec00427c6fd..14546c30bc63 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1799,6 +1799,8 @@ static inline bool kvm_is_supported_user_return_msr(u32 msr) u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a6d46520b550..61024ee9e85f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2334,6 +2334,31 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) +{ + u64 nested_offset; + + if (l2_multiplier == kvm_default_tsc_scaling_ratio) + nested_offset = l1_offset; + else + nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, + kvm_tsc_scaling_ratio_frac_bits); + + nested_offset += l2_offset; + return nested_offset; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); + +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) +{ + if (l2_multiplier != kvm_default_tsc_scaling_ratio) + return mul_u64_u64_shr(l1_multiplier, l2_multiplier, + kvm_tsc_scaling_ratio_frac_bits); + + return l1_multiplier; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); + static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { vcpu->arch.l1_tsc_offset = offset; -- cgit v1.2.3 From edcfe54058114cb3782cd2e919c224e14420e76e Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:15 +0100 Subject: KVM: X86: Move write_l1_tsc_offset() logic to common code and rename it The write_l1_tsc_offset() callback has a misleading name. It does not set L1's TSC offset, it rather updates the current TSC offset which might be different if a nested guest is executing. Additionally, both the vmx and svm implementations use the same logic for calculating the current TSC before writing it to hardware. Rename the function and move the common logic to the caller. The vmx/svm specific code now merely sets the given offset to the corresponding hardware structure. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-9-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 +- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm/svm.c | 21 ++++----------------- arch/x86/kvm/vmx/vmx.c | 23 +++-------------------- arch/x86/kvm/x86.c | 24 +++++++++++++++++++++--- 5 files changed, 30 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index c4906f73603d..026ca50ef73e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -89,7 +89,7 @@ KVM_X86_OP(load_mmu_pgd) KVM_X86_OP_NULL(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) KVM_X86_OP(get_l2_tsc_multiplier) -KVM_X86_OP(write_l1_tsc_offset) +KVM_X86_OP(write_tsc_offset) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 14546c30bc63..08773980393d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1313,8 +1313,7 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); - /* Returns actual tsc_offset set in active VMCS */ - u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); /* * Retrieve somewhat arbitrary exit information. Intended to be used diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 95ae2734760e..623f3c4b795a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1092,26 +1092,13 @@ static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_default_tsc_scaling_ratio; } -static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); - u64 g_tsc_offset = 0; - - if (is_guest_mode(vcpu)) { - /* Write L1's TSC offset. */ - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->vmcb01.ptr->control.tsc_offset; - svm->vmcb01.ptr->control.tsc_offset = offset; - } - - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - svm->vmcb->control.tsc_offset - g_tsc_offset, - offset); - - svm->vmcb->control.tsc_offset = offset + g_tsc_offset; + svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset; + svm->vmcb->control.tsc_offset = offset; vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - return svm->vmcb->control.tsc_offset; } /* Evaluate instruction intercepts that depend on guest CPUID features. */ @@ -4538,7 +4525,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_l2_tsc_offset = svm_get_l2_tsc_offset, .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, - .write_l1_tsc_offset = svm_write_l1_tsc_offset, + .write_tsc_offset = svm_write_tsc_offset, .load_mmu_pgd = svm_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2ce2c73645bf..54d08bebf9c6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1808,26 +1808,9 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_default_tsc_scaling_ratio; } -static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u64 g_tsc_offset = 0; - - /* - * We're here if L1 chose not to trap WRMSR to TSC. According - * to the spec, this should set L1's TSC; The offset that L1 - * set for L2 remains unchanged, and still needs to be added - * to the newly set TSC to get L2's TSC. - */ - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) - g_tsc_offset = vmcs12->tsc_offset; - - trace_kvm_write_tsc_offset(vcpu->vcpu_id, - vcpu->arch.tsc_offset - g_tsc_offset, - offset); - vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); - return offset + g_tsc_offset; + vmcs_write64(TSC_OFFSET, offset); } /* @@ -7723,7 +7706,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_l2_tsc_offset = vmx_get_l2_tsc_offset, .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, - .write_l1_tsc_offset = vmx_write_l1_tsc_offset, + .write_tsc_offset = vmx_write_tsc_offset, .load_mmu_pgd = vmx_load_mmu_pgd, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 61024ee9e85f..b42f6c8674e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2359,10 +2359,28 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) } EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); -static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { - vcpu->arch.l1_tsc_offset = offset; - vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset); + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vcpu->arch.l1_tsc_offset, + l1_offset); + + vcpu->arch.l1_tsc_offset = l1_offset; + + /* + * If we are here because L1 chose not to trap WRMSR to TSC then + * according to the spec this should set L1's TSC (as opposed to + * setting L1's offset for L2). + */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + l1_offset, + static_call(kvm_x86_get_l2_tsc_offset)(vcpu), + static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + else + vcpu->arch.tsc_offset = l1_offset; + + static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); } static inline bool kvm_check_tsc_unstable(void) -- cgit v1.2.3 From 1ab9287add5e265352d18517551abf6d01d004fd Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Mon, 7 Jun 2021 11:54:38 +0100 Subject: KVM: X86: Add vendor callbacks for writing the TSC multiplier Currently vmx_vcpu_load_vmcs() writes the TSC_MULTIPLIER field of the VMCS every time the VMCS is loaded. Instead of doing this, set this field from common code on initialization and whenever the scaling ratio changes. Additionally remove vmx->current_tsc_ratio. This field is redundant as vcpu->arch.tsc_scaling_ratio already tracks the current TSC scaling ratio. The vmx->current_tsc_ratio field is only used for avoiding unnecessary writes but it is no longer needed after removing the code from the VMCS load path. Suggested-by: Sean Christopherson Signed-off-by: Ilias Stamatis Message-Id: <20210607105438.16541-1-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/vmx/nested.c | 9 ++++----- arch/x86/kvm/vmx/vmx.c | 11 ++++++----- arch/x86/kvm/vmx/vmx.h | 8 -------- arch/x86/kvm/x86.c | 30 +++++++++++++++++++++++------- 7 files changed, 41 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 026ca50ef73e..aeb5f1136718 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -90,6 +90,7 @@ KVM_X86_OP_NULL(has_wbinvd_exit) KVM_X86_OP(get_l2_tsc_offset) KVM_X86_OP(get_l2_tsc_multiplier) KVM_X86_OP(write_tsc_offset) +KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 08773980393d..ca3b1925cffb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1314,6 +1314,7 @@ struct kvm_x86_ops { u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu, u64 multiplier); /* * Retrieve somewhat arbitrary exit information. Intended to be used diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 623f3c4b795a..a4d29ee9422d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1101,6 +1101,11 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); +} + /* Evaluate instruction intercepts that depend on guest CPUID features. */ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) @@ -4526,6 +4531,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_l2_tsc_offset = svm_get_l2_tsc_offset, .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, .write_tsc_offset = svm_write_tsc_offset, + .write_tsc_multiplier = svm_write_tsc_multiplier, .load_mmu_pgd = svm_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6058a65a6ede..239154d3e4e7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2533,9 +2533,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); @@ -4501,12 +4500,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (kvm_has_tsc_control) + vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); + if (vmx->nested.l1_tpr_threshold != -1) vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); - if (kvm_has_tsc_control) - decache_tsc_multiplier(vmx); - if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 54d08bebf9c6..092a045de869 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1390,11 +1390,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, vmx->loaded_vmcs->cpu = cpu; } - - /* Setup TSC multiplier */ - if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) - decache_tsc_multiplier(vmx); } /* @@ -1813,6 +1808,11 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcs_write64(TSC_OFFSET, offset); } +static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +{ + vmcs_write64(TSC_MULTIPLIER, multiplier); +} + /* * nested_vmx_allowed() checks whether a guest should be allowed to use VMX * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for @@ -7707,6 +7707,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_l2_tsc_offset = vmx_get_l2_tsc_offset, .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, .write_tsc_offset = vmx_write_tsc_offset, + .write_tsc_multiplier = vmx_write_tsc_multiplier, .load_mmu_pgd = vmx_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index aa97c82e3451..3eaa86a0ba3e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -322,8 +322,6 @@ struct vcpu_vmx { /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - u64 current_tsc_ratio; - unsigned long host_debugctlmsr; /* @@ -532,12 +530,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) GFP_KERNEL_ACCOUNT); } -static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) -{ - vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); -} - static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { return vmx->secondary_exec_control & diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b42f6c8674e6..85b40e9191e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2179,14 +2179,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier); + static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { u64 ratio; /* Guest TSC same frequency as host TSC? */ if (!scale) { - vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return 0; } @@ -2212,7 +2213,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return -1; } - vcpu->arch.l1_tsc_scaling_ratio = vcpu->arch.tsc_scaling_ratio = ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, ratio); return 0; } @@ -2224,8 +2225,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - vcpu->arch.l1_tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); return -1; } @@ -2383,6 +2383,23 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); } +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) +{ + vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier; + + /* Userspace is changing the multiplier while L2 is active */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + l1_multiplier, + static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + else + vcpu->arch.tsc_scaling_ratio = l1_multiplier; + + if (kvm_has_tsc_control) + static_call(kvm_x86_write_tsc_multiplier)( + vcpu, vcpu->arch.tsc_scaling_ratio); +} + static inline bool kvm_check_tsc_unstable(void) { #ifdef CONFIG_X86_64 @@ -10364,8 +10381,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - kvm_set_tsc_khz(vcpu, max_tsc_khz); - r = kvm_mmu_create(vcpu); if (r < 0) return r; @@ -10433,6 +10448,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); + kvm_set_tsc_khz(vcpu, max_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu, false); vcpu_put(vcpu); -- cgit v1.2.3 From d041b5ea93352b3d226352a7238a89da2dd7becb Mon Sep 17 00:00:00 2001 From: Ilias Stamatis Date: Wed, 26 May 2021 19:44:17 +0100 Subject: KVM: nVMX: Enable nested TSC scaling Calculate the TSC offset and multiplier on nested transitions and expose the TSC scaling feature to L1. Signed-off-by: Ilias Stamatis Reviewed-by: Maxim Levitsky Message-Id: <20210526184418.28881-11-ilstam@amazon.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 239154d3e4e7..e8183e224706 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2277,7 +2277,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_ENABLE_VMFUNC); + SECONDARY_EXEC_ENABLE_VMFUNC | + SECONDARY_EXEC_TSC_SCALING); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -2532,6 +2533,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + vmx_get_l2_tsc_offset(vcpu), + vmx_get_l2_tsc_multiplier(vcpu)); + + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + vcpu->arch.l1_tsc_scaling_ratio, + vmx_get_l2_tsc_multiplier(vcpu)); + vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (kvm_has_tsc_control) vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); @@ -3353,8 +3363,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, } enter_guest_mode(vcpu); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) - vcpu->arch.tsc_offset += vmcs12->tsc_offset; if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; @@ -4462,8 +4470,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (nested_cpu_has_preemption_timer(vmcs12)) hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) - vcpu->arch.tsc_offset -= vmcs12->tsc_offset; + if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { + vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) + vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; + } if (likely(!vmx->fail)) { sync_vmcs02_to_vmcs12(vcpu, vmcs12); @@ -6473,7 +6484,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_TSC_SCALING; /* * We can emulate "VMCS shadowing," even if the hardware -- cgit v1.2.3 From d82ee2819517eefd6f42465ccf3e3e621bbf4080 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 4 Jun 2021 10:26:00 -0700 Subject: KVM: x86: Remove guest mode check from kvm_check_nested_events A survey of the callsites reveals that they all ensure the vCPU is in guest mode before calling kvm_check_nested_events. Remove this dead code so that the only negative value this function returns (at the moment) is -EBUSY. Signed-off-by: Jim Mattson Message-Id: <20210604172611.281819-2-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 85b40e9191e5..211d2dccb441 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8625,9 +8625,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) int kvm_check_nested_events(struct kvm_vcpu *vcpu) { - if (WARN_ON_ONCE(!is_guest_mode(vcpu))) - return -EIO; - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { kvm_x86_ops.nested_ops->triple_fault(vcpu); return 1; -- cgit v1.2.3 From 650293c3de6b042c4a2e87b2bc678efcff3843e8 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 4 Jun 2021 10:26:02 -0700 Subject: KVM: nVMX: Add a return code to vmx_complete_nested_posted_interrupt No functional change intended. Signed-off-by: Jim Mattson Reviewed-by: Oliver Upton Message-Id: <20210604172611.281819-4-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e8183e224706..73f63ad06366 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3689,7 +3689,7 @@ void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) } } -static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -3697,17 +3697,17 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) u16 status; if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) - return; + return 0; vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return; + return 0; max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr != 256) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) - return; + return 0; __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page, &max_irr); @@ -3720,6 +3720,7 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) } nested_mark_vmcs12_pages_dirty(vcpu); + return 0; } static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, @@ -3894,8 +3895,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } no_vmexit: - vmx_complete_nested_posted_interrupt(vcpu); - return 0; + return vmx_complete_nested_posted_interrupt(vcpu); } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From a5f6909a71f9223b7d7da71974bae226f94d9d68 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 4 Jun 2021 10:26:03 -0700 Subject: KVM: x86: Add a return code to inject_pending_event No functional change intended. At present, 'r' will always be -EBUSY on a control transfer to the 'out' label. Signed-off-by: Jim Mattson Message-Id: <20210604172611.281819-5-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 211d2dccb441..1ae827f0d954 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8640,7 +8640,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) static_call(kvm_x86_queue_exception)(vcpu); } -static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) +static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) { int r; bool can_inject = true; @@ -8687,7 +8687,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit if (is_guest_mode(vcpu)) { r = kvm_check_nested_events(vcpu); if (r < 0) - goto busy; + goto out; } /* try to inject new event if pending */ @@ -8729,7 +8729,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit if (vcpu->arch.smi_pending) { r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; if (r < 0) - goto busy; + goto out; if (r) { vcpu->arch.smi_pending = false; ++vcpu->arch.smi_count; @@ -8742,7 +8742,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit if (vcpu->arch.nmi_pending) { r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; if (r < 0) - goto busy; + goto out; if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; @@ -8757,7 +8757,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit if (kvm_cpu_has_injectable_intr(vcpu)) { r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; if (r < 0) - goto busy; + goto out; if (r) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); static_call(kvm_x86_set_irq)(vcpu); @@ -8773,11 +8773,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit *req_immediate_exit = true; WARN_ON(vcpu->arch.exception.pending); - return; + return 0; -busy: - *req_immediate_exit = true; - return; +out: + if (r == -EBUSY) { + *req_immediate_exit = true; + r = 0; + } + return r; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -9338,7 +9341,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - inject_pending_event(vcpu, &req_immediate_exit); + r = inject_pending_event(vcpu, &req_immediate_exit); + if (r < 0) { + r = 0; + goto out; + } if (req_int_win) static_call(kvm_x86_enable_irq_window)(vcpu); -- cgit v1.2.3 From 4fe09bcf14a666b8fa4d79ce1b4c87afa753f827 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 4 Jun 2021 10:26:04 -0700 Subject: KVM: x86: Add a return code to kvm_apic_accept_events No functional change intended. At present, the only negative value returned by kvm_check_nested_events is -EBUSY. Signed-off-by: Jim Mattson Message-Id: <20210604172611.281819-6-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 11 ++++++----- arch/x86/kvm/lapic.h | 2 +- arch/x86/kvm/x86.c | 25 ++++++++++++++++++++----- 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 17fa4ab1b834..4b80e613096b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2872,7 +2872,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); } -void kvm_apic_accept_events(struct kvm_vcpu *vcpu) +int kvm_apic_accept_events(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; u8 sipi_vector; @@ -2880,7 +2880,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) unsigned long pe; if (!lapic_in_kernel(vcpu)) - return; + return 0; /* * Read pending events before calling the check_events @@ -2888,12 +2888,12 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) */ pe = smp_load_acquire(&apic->pending_events); if (!pe) - return; + return 0; if (is_guest_mode(vcpu)) { r = kvm_check_nested_events(vcpu); if (r < 0) - return; + return r == -EBUSY ? 0 : r; /* * If an event has happened and caused a vmexit, * we know INITs are latched and therefore @@ -2914,7 +2914,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); if (test_bit(KVM_APIC_SIPI, &pe)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); - return; + return 0; } if (test_bit(KVM_APIC_INIT, &pe)) { @@ -2935,6 +2935,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } + return 0; } void kvm_lapic_exit(void) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 997c45a5963a..d7c25d0c1354 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -76,7 +76,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); -void kvm_apic_accept_events(struct kvm_vcpu *vcpu); +int kvm_apic_accept_events(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1ae827f0d954..d1fdbaa6e1a9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9335,7 +9335,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || kvm_xen_has_interrupt(vcpu)) { ++vcpu->stat.req_event; - kvm_apic_accept_events(vcpu); + r = kvm_apic_accept_events(vcpu); + if (r < 0) { + r = 0; + goto out; + } if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; goto out; @@ -9547,7 +9551,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) return 1; } - kvm_apic_accept_events(vcpu); + if (kvm_apic_accept_events(vcpu) < 0) + return 0; switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: @@ -9771,7 +9776,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } kvm_vcpu_block(vcpu); - kvm_apic_accept_events(vcpu); + if (kvm_apic_accept_events(vcpu) < 0) { + r = 0; + goto out; + } kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; if (signal_pending(current)) { @@ -9973,11 +9981,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int r; + vcpu_load(vcpu); if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); - kvm_apic_accept_events(vcpu); + r = kvm_apic_accept_events(vcpu); + if (r < 0) + goto out; + r = 0; + if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED || vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) && vcpu->arch.pv.pv_unhalted) @@ -9985,10 +9999,11 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, else mp_state->mp_state = vcpu->arch.mp_state; +out: if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); - return 0; + return r; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 0fe998b295a37234392072c23e22b8bba4774d0f Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 4 Jun 2021 10:26:05 -0700 Subject: KVM: nVMX: Fail on MMIO completion for nested posted interrupts When the kernel has no mapping for the vmcs02 virtual APIC page, userspace MMIO completion is necessary to process nested posted interrupts. This is not a configuration that KVM supports. Rather than silently ignoring the problem, try to exit to userspace with KVM_INTERNAL_ERROR. Note that the event that triggers this error is consumed as a side-effect of a call to kvm_check_nested_events. On some paths (notably through kvm_vcpu_check_block), the error is dropped. In any case, this is an incremental improvement over always ignoring the error. Signed-off-by: Jim Mattson Message-Id: <20210604172611.281819-7-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 73f63ad06366..4e545996440b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3707,7 +3707,7 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) if (max_irr != 256) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) - return 0; + goto mmio_needed; __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page, &max_irr); @@ -3721,6 +3721,10 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) nested_mark_vmcs12_pages_dirty(vcpu); return 0; + +mmio_needed: + kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); + return -ENXIO; } static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 966eefb8965798478c2a6de3aa35ec180323792d Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 4 Jun 2021 10:26:06 -0700 Subject: KVM: nVMX: Disable vmcs02 posted interrupts if vmcs12 PID isn't mappable Don't allow posted interrupts to modify a stale posted interrupt descriptor (including the initial value of 0). Empirical tests on real hardware reveal that a posted interrupt descriptor referencing an unbacked address has PCI bus error semantics (reads as all 1's; writes are ignored). However, kvm can't distinguish unbacked addresses from device-backed (MMIO) addresses, so it should really ask userspace for an MMIO completion. That's overly complicated, so just punt with KVM_INTERNAL_ERROR. Don't return the error until the posted interrupt descriptor is actually accessed. We don't want to break the existing kvm-unit-tests that assume they can launch an L2 VM with a posted interrupt descriptor that references MMIO space in L1. Fixes: 6beb7bd52e48 ("kvm: nVMX: Refactor nested_get_vmcs12_pages()") Signed-off-by: Jim Mattson Message-Id: <20210604172611.281819-8-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4e545996440b..98b5f5f104da 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3184,6 +3184,15 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) offset_in_page(vmcs12->posted_intr_desc_addr)); vmcs_write64(POSTED_INTR_DESC_ADDR, pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); + } else { + /* + * Defer the KVM_INTERNAL_EXIT until KVM tries to + * access the contents of the VMCS12 posted interrupt + * descriptor. (Note that KVM may do this when it + * should not, per the architectural specification.) + */ + vmx->nested.pi_desc = NULL; + pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); } } if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) @@ -3696,10 +3705,14 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) void *vapic_page; u16 status; - if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) + if (!vmx->nested.pi_pending) return 0; + if (!vmx->nested.pi_desc) + goto mmio_needed; + vmx->nested.pi_pending = false; + if (!pi_test_and_clear_on(vmx->nested.pi_desc)) return 0; -- cgit v1.2.3 From 7d62874f69d7e5c1c1063a5848075bd1adff3998 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sun, 6 Jun 2021 11:10:45 +0900 Subject: kvm: x86: implement KVM PM-notifier Implement PM hibernation/suspend prepare notifiers so that KVM can reliably set PVCLOCK_GUEST_STOPPED on VCPUs and properly suspend VMs. Signed-off-by: Sergey Senozhatsky Message-Id: <20210606021045.14159-2-senozhatsky@chromium.org> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/x86.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index f6b93a35ce14..7a78b88c0f1a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -46,6 +46,7 @@ config KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU + select HAVE_KVM_PM_NOTIFIER if PM help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d1fdbaa6e1a9..3c5a33ab10c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -58,6 +58,7 @@ #include #include #include +#include #include @@ -5701,6 +5702,41 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) return 0; } +#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER +static int kvm_arch_suspend_notifier(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i, ret = 0; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!vcpu->arch.pv_time_enabled) + continue; + + ret = kvm_set_guest_paused(vcpu); + if (ret) { + kvm_err("Failed to pause guest VCPU%d: %d\n", + vcpu->vcpu_id, ret); + break; + } + } + mutex_unlock(&kvm->lock); + + return ret ? NOTIFY_BAD : NOTIFY_DONE; +} + +int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + return kvm_arch_suspend_notifier(kvm); + } + + return NOTIFY_DONE; +} +#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { -- cgit v1.2.3 From fdf513e37a3bd9f498179c878cfcd59693bf507c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 9 Jun 2021 17:09:08 +0200 Subject: KVM: x86: Use common 'enable_apicv' variable for both APICv and AVIC Unify VMX and SVM code by moving APICv/AVIC enablement tracking to common 'enable_apicv' variable. Note: unlike APICv, AVIC is disabled by default. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20210609150911.1471882-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/avic.c | 14 +++++--------- arch/x86/kvm/svm/svm.c | 23 ++++++++++++++--------- arch/x86/kvm/svm/svm.h | 2 -- arch/x86/kvm/vmx/capabilities.h | 1 - arch/x86/kvm/vmx/vmx.c | 1 - arch/x86/kvm/x86.c | 3 +++ 7 files changed, 23 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ca3b1925cffb..7f53e5fba735 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1432,6 +1432,7 @@ struct kvm_arch_async_pf { extern u32 __read_mostly kvm_nr_uret_msrs; extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; +extern bool __read_mostly enable_apicv; extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 5e7e920113f3..a9abed054cd5 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -27,10 +27,6 @@ #include "irq.h" #include "svm.h" -/* enable / disable AVIC */ -bool avic; -module_param(avic, bool, S_IRUGO); - #define SVM_AVIC_DOORBELL 0xc001011b #define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) @@ -124,7 +120,7 @@ void avic_vm_destroy(struct kvm *kvm) unsigned long flags; struct kvm_svm *kvm_svm = to_kvm_svm(kvm); - if (!avic) + if (!enable_apicv) return; if (kvm_svm->avic_logical_id_table_page) @@ -147,7 +143,7 @@ int avic_vm_init(struct kvm *kvm) struct page *l_page; u32 vm_id; - if (!avic) + if (!enable_apicv) return 0; /* Allocating physical APIC ID table (4KB) */ @@ -569,7 +565,7 @@ int avic_init_vcpu(struct vcpu_svm *svm) int ret; struct kvm_vcpu *vcpu = &svm->vcpu; - if (!avic || !irqchip_in_kernel(vcpu->kvm)) + if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) return 0; ret = avic_init_backing_page(vcpu); @@ -593,7 +589,7 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) { - if (!avic || !lapic_in_kernel(vcpu)) + if (!enable_apicv || !lapic_in_kernel(vcpu)) return; srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -653,7 +649,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm->vmcb; bool activated = kvm_vcpu_apicv_active(vcpu); - if (!avic) + if (!enable_apicv) return; if (activated) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index a4d29ee9422d..00ea8dc1bc9c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -185,6 +185,13 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* + * enable / disable AVIC. Because the defaults differ for APICv + * support between VMX and SVM we cannot use module_param_named. + */ +static bool avic; +module_param(avic, bool, 0444); + bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); @@ -1009,14 +1016,12 @@ static __init int svm_hardware_setup(void) nrips = false; } - if (avic) { - if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) { - avic = false; - } else { - pr_info("AVIC enabled\n"); + enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } + if (enable_apicv) { + pr_info("AVIC enabled\n"); + + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); } if (vls) { @@ -4431,13 +4436,13 @@ static int svm_vm_init(struct kvm *kvm) if (!pause_filter_count || !pause_filter_thresh) kvm->arch.pause_in_guest = true; - if (avic) { + if (enable_apicv) { int ret = avic_vm_init(kvm); if (ret) return ret; } - kvm_apicv_init(kvm, avic); + kvm_apicv_init(kvm, enable_apicv); return 0; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 70419e417c0d..a514b490db4a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -479,8 +479,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -extern bool avic; - static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index aa0e7872fcc9..4705ad55abb5 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -12,7 +12,6 @@ extern bool __read_mostly enable_ept; extern bool __read_mostly enable_unrestricted_guest; extern bool __read_mostly enable_ept_ad_bits; extern bool __read_mostly enable_pml; -extern bool __read_mostly enable_apicv; extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 092a045de869..981361d095ed 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -101,7 +101,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); -bool __read_mostly enable_apicv = 1; module_param(enable_apicv, bool, S_IRUGO); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c5a33ab10c0..8324313f12b5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -210,6 +210,9 @@ EXPORT_SYMBOL_GPL(host_efer); bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); +bool __read_mostly enable_apicv = true; +EXPORT_SYMBOL_GPL(enable_apicv); + u64 __read_mostly host_xss; EXPORT_SYMBOL_GPL(host_xss); u64 __read_mostly supported_xss; -- cgit v1.2.3 From 4651fc56bad01d340844c5fbf1e1f817639208ab Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 9 Jun 2021 17:09:09 +0200 Subject: KVM: x86: Drop vendor specific functions for APICv/AVIC enablement Now that APICv/AVIC enablement is kept in common 'enable_apicv' variable, there's no need to call kvm_apicv_init() from vendor specific code. No functional change intended. Reviewed-by: Sean Christopherson Reviewed-by: Maxim Levitsky Signed-off-by: Vitaly Kuznetsov Message-Id: <20210609150911.1471882-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm/svm.c | 1 - arch/x86/kvm/vmx/vmx.c | 1 - arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7f53e5fba735..ced3e3b94b77 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1672,7 +1672,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); bool kvm_apicv_activated(struct kvm *kvm); -void kvm_apicv_init(struct kvm *kvm, bool enable); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void kvm_request_apicv_update(struct kvm *kvm, bool activate, unsigned long bit); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 00ea8dc1bc9c..1e2c635d308c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4442,7 +4442,6 @@ static int svm_vm_init(struct kvm *kvm) return ret; } - kvm_apicv_init(kvm, enable_apicv); return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 981361d095ed..76586ce9cf76 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7004,7 +7004,6 @@ static int vmx_vm_init(struct kvm *kvm) break; } } - kvm_apicv_init(kvm, enable_apicv); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8324313f12b5..ec11ce280fdc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8471,16 +8471,15 @@ bool kvm_apicv_activated(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_apicv_activated); -void kvm_apicv_init(struct kvm *kvm, bool enable) +static void kvm_apicv_init(struct kvm *kvm) { - if (enable) + if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); else set_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); } -EXPORT_SYMBOL_GPL(kvm_apicv_init); static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) { @@ -10885,6 +10884,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); -- cgit v1.2.3 From 25b17226cd9a77982fc8c915d4118d7238a0f079 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:12 -0700 Subject: KVM: x86: Emulate triple fault shutdown if RSM emulation fails Use the recently introduced KVM_REQ_TRIPLE_FAULT to properly emulate shutdown if RSM from SMM fails. Note, entering shutdown after clearing the SMM flag and restoring NMI blocking is architecturally correct with respect to AMD's APM, which KVM also uses for SMRAM layout and RSM NMI blocking behavior. The APM says: An RSM causes a processor shutdown if an invalid-state condition is found in the SMRAM state-save area. Only an external reset, external processor-initialization, or non-maskable external interrupt (NMI) can cause the processor to leave the shutdown state. Of note is processor-initialization (INIT) as a valid shutdown wake event, as INIT is blocked by SMM, implying that entering shutdown also forces the CPU out of SMM. For recent Intel CPUs, restoring NMI blocking is technically wrong, but so is restoring NMI blocking in the first place, and Intel's RSM "architecture" is such a mess that just about anything is allowed and can be justified as micro-architectural behavior. Per the SDM: On Pentium 4 and later processors, shutdown will inhibit INTR and A20M but will not change any of the other inhibits. On these processors, NMIs will be inhibited if no action is taken in the SMI handler to uninhibit them (see Section 34.8). where Section 34.8 says: When the processor enters SMM while executing an NMI handler, the processor saves the SMRAM state save map but does not save the attribute to keep NMI interrupts disabled. Potentially, an NMI could be latched (while in SMM or upon exit) and serviced upon exit of SMM even though the previous NMI handler has still not completed. I.e. RSM unconditionally unblocks NMI, but shutdown on RSM does not, which is in direct contradiction of KVM's behavior. But, as mentioned above, KVM follows AMD architecture and restores NMI blocking on RSM, so that micro-architectural detail is already lost. And for Pentium era CPUs, SMI# can break shutdown, meaning that at least some Intel CPUs fully leave SMM when entering shutdown: In the shutdown state, Intel processors stop executing instructions until a RESET#, INIT# or NMI# is asserted. While Pentium family processors recognize the SMI# signal in shutdown state, P6 family and Intel486 processors do not. In other words, the fact that Intel CPUs have implemented the two extremes gives KVM carte blanche when it comes to honoring Intel's architecture for handling shutdown during RSM. Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-3-seanjc@google.com> [Return X86EMUL_CONTINUE after triple fault. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 12 +++++++----- arch/x86/kvm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 10e16a70b361..63f9ca1c0ce0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2580,7 +2580,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) * state-save area. */ if (ctxt->ops->pre_leave_smm(ctxt, buf)) - return X86EMUL_UNHANDLEABLE; + goto emulate_shutdown; #ifdef CONFIG_X86_64 if (emulator_has_longmode(ctxt)) @@ -2589,14 +2589,16 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) #endif ret = rsm_load_state_32(ctxt, buf); - if (ret != X86EMUL_CONTINUE) { - /* FIXME: should triple fault */ - return X86EMUL_UNHANDLEABLE; - } + if (ret != X86EMUL_CONTINUE) + goto emulate_shutdown; ctxt->ops->post_leave_smm(ctxt); return X86EMUL_CONTINUE; + +emulate_shutdown: + ctxt->ops->triple_fault(ctxt); + return X86EMUL_CONTINUE; } static void diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index b063d376b7d9..357cfd1ccafd 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -234,6 +234,7 @@ struct x86_emulate_ops { int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); + void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec11ce280fdc..7bd1ddfec522 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7233,6 +7233,11 @@ static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) kvm_smm_changed(emul_to_vcpu(ctxt)); } +static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) +{ + kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); +} + static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) { return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); @@ -7282,6 +7287,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, .post_leave_smm = emulator_post_leave_smm, + .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, }; -- cgit v1.2.3 From edce46548b70b8637694d96122447662ff35af0c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:13 -0700 Subject: KVM: x86: Replace .set_hflags() with dedicated .exiting_smm() helper Replace the .set_hflags() emulator hook with a dedicated .exiting_smm(), moving the SMM and SMM_INSIDE_NMI flag handling out of the emulator in the process. This is a step towards consolidating much of the logic in kvm_smm_changed(), including the SMM hflags updates. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 +-- arch/x86/kvm/kvm_emulate.h | 2 +- arch/x86/kvm/x86.c | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 63f9ca1c0ce0..4996eec7aa79 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2535,8 +2535,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & - ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); + ctxt->ops->exiting_smm(ctxt); /* * Get back to real mode, to prepare a safe state in which to load diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 357cfd1ccafd..298bb0da7b97 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -230,7 +230,7 @@ struct x86_emulate_ops { void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); - void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); + void (*exiting_smm)(struct x86_emulate_ctxt *ctxt); int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bd1ddfec522..15a9859b6046 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7214,11 +7214,11 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) return emul_to_vcpu(ctxt)->arch.hflags; } -static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - vcpu->arch.hflags = emul_flags; + vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); kvm_mmu_reset_context(vcpu); } @@ -7284,7 +7284,7 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_fxsr = emulator_guest_has_fxsr, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, - .set_hflags = emulator_set_hflags, + .exiting_smm = emulator_exiting_smm, .pre_leave_smm = emulator_pre_leave_smm, .post_leave_smm = emulator_post_leave_smm, .triple_fault = emulator_triple_fault, -- cgit v1.2.3 From fa75e08bbe4f8ea609f61bbb6c04b3bb2b38c793 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:14 -0700 Subject: KVM: x86: Invoke kvm_smm_changed() immediately after clearing SMM flag Move RSM emulation's call to kvm_smm_changed() from .post_leave_smm() to .exiting_smm(), leaving behind the MMU context reset. The primary motivation is to allow for future cleanup, but this also fixes a bug of sorts by queueing KVM_REQ_EVENT even if RSM causes shutdown, e.g. to let an INIT wake the vCPU from shutdown. Of course, KVM doesn't properly emulate a shutdown state, e.g. KVM doesn't block SMIs after shutdown, and immediately exits to userspace, so the event request is a moot point in practice. Moving kvm_smm_changed() also moves the RSM tracepoint. This isn't strictly necessary, but will allow consolidating the SMI and RSM tracepoints in a future commit (by also moving the SMI tracepoint). Invoking the tracepoint before loading SMRAM state also means the SMBASE that reported in the tracepoint will point that the state that will be used for RSM, as opposed to the SMBASE _after_ RSM completes, which is arguably a good thing if the tracepoint is being used to debug a RSM/SMM issue. Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15a9859b6046..774f2e7bedae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7219,7 +7219,7 @@ static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); - kvm_mmu_reset_context(vcpu); + kvm_smm_changed(vcpu); } static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, @@ -7230,7 +7230,7 @@ static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) { - kvm_smm_changed(emul_to_vcpu(ctxt)); + kvm_mmu_reset_context(emul_to_vcpu(ctxt)); } static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) -- cgit v1.2.3 From dc87275f47332be922d4eb299595523cc3a97479 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:15 -0700 Subject: KVM: x86: Move (most) SMM hflags modifications into kvm_smm_changed() Move the core of SMM hflags modifications into kvm_smm_changed() and use kvm_smm_changed() in enter_smm(). Clear HF_SMM_INSIDE_NMI_MASK for leaving SMM but do not set it for entering SMM. If the vCPU is executing outside of SMM, the flag should unequivocally be cleared, e.g. this technically fixes a benign bug where the flag could be left set after KVM_SET_VCPU_EVENTS, but the reverse is not true as NMI blocking depends on pre-SMM state or userspace input. Note, this adds an extra kvm_mmu_reset_context() to enter_smm(). The extra/early reset isn't strictly necessary, and in a way can never be necessary since the vCPU/MMU context is in a half-baked state until the final context reset at the end of the function. But, enter_smm() is not a hot path, and exploding on an invalid root_hpa is probably better than having a stale SMM flag in the MMU role; it's at least no worse. Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 774f2e7bedae..57efc3a49753 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4532,7 +4532,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_smm_changed(struct kvm_vcpu *vcpu); +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm); static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) @@ -4592,13 +4592,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { - if (events->smi.smm) - vcpu->arch.hflags |= HF_SMM_MASK; - else - vcpu->arch.hflags &= ~HF_SMM_MASK; - kvm_smm_changed(vcpu); - } + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) + kvm_smm_changed(vcpu, events->smi.smm); vcpu->arch.smi_pending = events->smi.pending; @@ -7218,8 +7213,7 @@ static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); - kvm_smm_changed(vcpu); + kvm_smm_changed(vcpu, false); } static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, @@ -7548,9 +7542,13 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); -static void kvm_smm_changed(struct kvm_vcpu *vcpu) +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { - if (!(vcpu->arch.hflags & HF_SMM_MASK)) { + if (entering_smm) { + vcpu->arch.hflags |= HF_SMM_MASK; + } else { + vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); + /* This is a good place to trace that we are exiting SMM. */ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); @@ -9022,7 +9020,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) */ static_call(kvm_x86_pre_enter_smm)(vcpu, buf); - vcpu->arch.hflags |= HF_SMM_MASK; + kvm_smm_changed(vcpu, true); kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); if (static_call(kvm_x86_get_nmi_mask)(vcpu)) -- cgit v1.2.3 From 0d7ee6f4b58dc6aca54df285cec027727c976892 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:16 -0700 Subject: KVM: x86: Move "entering SMM" tracepoint into kvm_smm_changed() Invoke the "entering SMM" tracepoint from kvm_smm_changed() instead of enter_smm(), effectively moving it from before reading vCPU state to after reading state (but still before writing it to SMRAM!). The primary motivation is to consolidate code, but calling the tracepoint from kvm_smm_changed() also makes its invocation consistent with respect to SMI and RSM, and with respect to KVM_SET_VCPU_EVENTS (which previously only invoked the tracepoint when forcing the vCPU out of SMM). Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57efc3a49753..389f634a4083 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7544,14 +7544,13 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu); static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); + if (entering_smm) { vcpu->arch.hflags |= HF_SMM_MASK; } else { vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); - /* This is a good place to trace that we are exiting SMM. */ - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); - /* Process a latched INIT or SMI, if any. */ kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -9004,7 +9003,6 @@ static void enter_smm(struct kvm_vcpu *vcpu) char buf[512]; u32 cr0; - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); memset(buf, 0, 512); #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) -- cgit v1.2.3 From 1270e647c802b427c8114816b0f35b961600f104 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:17 -0700 Subject: KVM: x86: Rename SMM tracepoint to make it reflect reality Rename the SMM tracepoint, which handles both entering and exiting SMM, from kvm_enter_smm to kvm_smm_transition. Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4f839148948b..b484141ea15b 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -997,7 +997,7 @@ TRACE_EVENT(kvm_wait_lapic_expire, __entry->delta < 0 ? "early" : "late") ); -TRACE_EVENT(kvm_enter_smm, +TRACE_EVENT(kvm_smm_transition, TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), TP_ARGS(vcpu_id, smbase, entering), diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 389f634a4083..1017d398e72d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7544,7 +7544,7 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu); static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); + trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); if (entering_smm) { vcpu->arch.hflags |= HF_SMM_MASK; -- cgit v1.2.3 From 0128116550acf52043a0aa5cca3caa85e3853aca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:18 -0700 Subject: KVM: x86: Drop .post_leave_smm(), i.e. the manual post-RSM MMU reset Drop the .post_leave_smm() emulator callback, which at this point is just a wrapper to kvm_mmu_reset_context(). The manual context reset is unnecessary, because unlike enter_smm() which calls vendor MSR/CR helpers directly, em_rsm() bounces through the KVM helpers, e.g. kvm_set_cr4(), which are responsible for processing side effects. em_rsm() is already subtly relying on this behavior as it doesn't manually do kvm_update_cpuid_runtime(), e.g. to recognize CR4.OSXSAVE changes. Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 10 ++++++++-- arch/x86/kvm/kvm_emulate.h | 1 - arch/x86/kvm/x86.c | 6 ------ 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4996eec7aa79..83520a9f171e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2591,8 +2591,14 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if (ret != X86EMUL_CONTINUE) goto emulate_shutdown; - ctxt->ops->post_leave_smm(ctxt); - + /* + * Note, the ctxt->ops callbacks are responsible for handling side + * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID + * runtime updates, etc... If that changes, e.g. this flow is moved + * out of the emulator to make it look more like enter_smm(), then + * those side effects need to be explicitly handled for both success + * and shutdown. + */ return X86EMUL_CONTINUE; emulate_shutdown: diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 298bb0da7b97..3ee701b0ef10 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -233,7 +233,6 @@ struct x86_emulate_ops { void (*exiting_smm)(struct x86_emulate_ctxt *ctxt); int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); - void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1017d398e72d..9a268728399e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7222,11 +7222,6 @@ static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate); } -static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) -{ - kvm_mmu_reset_context(emul_to_vcpu(ctxt)); -} - static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); @@ -7280,7 +7275,6 @@ static const struct x86_emulate_ops emulate_ops = { .get_hflags = emulator_get_hflags, .exiting_smm = emulator_exiting_smm, .pre_leave_smm = emulator_pre_leave_smm, - .post_leave_smm = emulator_post_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, }; -- cgit v1.2.3 From ecc513e5bb7ed5d007dcaa533729360e9eb673ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 11:56:19 -0700 Subject: KVM: x86: Drop "pre_" from enter/leave_smm() helpers Now that .post_leave_smm() is gone, drop "pre_" from the remaining helpers. The helpers aren't invoked purely before SMI/RSM processing, e.g. both helpers are invoked after state is snapshotted (from regs or SMRAM), and the RSM helper is invoked after some amount of register state has been stuffed. Signed-off-by: Sean Christopherson Message-Id: <20210609185619.992058-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 4 ++-- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/emulate.c | 6 +++--- arch/x86/kvm/kvm_emulate.h | 3 +-- arch/x86/kvm/svm/svm.c | 8 ++++---- arch/x86/kvm/vmx/vmx.c | 8 ++++---- arch/x86/kvm/x86.c | 14 +++++++------- 7 files changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index aeb5f1136718..a12a4987154e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -109,8 +109,8 @@ KVM_X86_OP_NULL(set_hv_timer) KVM_X86_OP_NULL(cancel_hv_timer) KVM_X86_OP(setup_mce) KVM_X86_OP(smi_allowed) -KVM_X86_OP(pre_enter_smm) -KVM_X86_OP(pre_leave_smm) +KVM_X86_OP(enter_smm) +KVM_X86_OP(leave_smm) KVM_X86_OP(enable_smi_window) KVM_X86_OP_NULL(mem_enc_op) KVM_X86_OP_NULL(mem_enc_reg_region) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ced3e3b94b77..921de30c23c5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1372,8 +1372,8 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); + int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate); + int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); void (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 83520a9f171e..2837110e66ed 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2574,11 +2574,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) } /* - * Give pre_leave_smm() a chance to make ISA-specific changes to the - * vCPU state (e.g. enter guest mode) before loading state from the SMM + * Give leave_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. enter guest mode) before loading state from the SMM * state-save area. */ - if (ctxt->ops->pre_leave_smm(ctxt, buf)) + if (ctxt->ops->leave_smm(ctxt, buf)) goto emulate_shutdown; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 3ee701b0ef10..68b420289d7e 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -231,8 +231,7 @@ struct x86_emulate_ops { unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); void (*exiting_smm)(struct x86_emulate_ctxt *ctxt); - int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, - const char *smstate); + int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1e2c635d308c..e7bec71a3d9b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4258,7 +4258,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !svm_smi_blocked(vcpu); } -static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); int ret; @@ -4280,7 +4280,7 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map; @@ -4555,8 +4555,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .setup_mce = svm_setup_mce, .smi_allowed = svm_smi_allowed, - .pre_enter_smm = svm_pre_enter_smm, - .pre_leave_smm = svm_pre_leave_smm, + .enter_smm = svm_enter_smm, + .leave_smm = svm_leave_smm, .enable_smi_window = svm_enable_smi_window, .mem_enc_op = svm_mem_enc_op, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 76586ce9cf76..51bbde75b1fd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7544,7 +7544,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !is_smm(vcpu); } -static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) +static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7558,7 +7558,7 @@ static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) +static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -7736,8 +7736,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .setup_mce = vmx_setup_mce, .smi_allowed = vmx_smi_allowed, - .pre_enter_smm = vmx_pre_enter_smm, - .pre_leave_smm = vmx_pre_leave_smm, + .enter_smm = vmx_enter_smm, + .leave_smm = vmx_leave_smm, .enable_smi_window = vmx_enable_smi_window, .can_emulate_instruction = vmx_can_emulate_instruction, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9a268728399e..8d88e4513294 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7216,10 +7216,10 @@ static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) kvm_smm_changed(vcpu, false); } -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt, const char *smstate) { - return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate); + return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate); } static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) @@ -7274,7 +7274,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .exiting_smm = emulator_exiting_smm, - .pre_leave_smm = emulator_pre_leave_smm, + .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, }; @@ -9006,11 +9006,11 @@ static void enter_smm(struct kvm_vcpu *vcpu) enter_smm_save_state_32(vcpu, buf); /* - * Give pre_enter_smm() a chance to make ISA-specific changes to the - * vCPU state (e.g. leave guest mode) after we've saved the state into - * the SMM state-save area. + * Give enter_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. leave guest mode) after we've saved the state into the + * SMM state-save area. */ - static_call(kvm_x86_pre_enter_smm)(vcpu, buf); + static_call(kvm_x86_enter_smm)(vcpu, buf); kvm_smm_changed(vcpu, true); kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); -- cgit v1.2.3 From b93af02c6722fde384ed2e921b71b61b9addb740 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 9 Jun 2021 14:03:38 -0400 Subject: KVM: nVMX: nSVM: 'nested_run' should count guest-entry attempts that make it to guest code Currently, the 'nested_run' statistic counts all guest-entry attempts, including those that fail during vmentry checks on Intel and during consistency checks on AMD. Convert this statistic to count only those guest-entries that make it past these state checks and make it to guest code. This will tell us the number of guest-entries that actually executed or tried to execute guest code. Signed-off-by: Krish Sadhukhan Message-Id: <20210609180340.104248-2-krish.sadhukhan@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 -- arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/vmx/nested.c | 2 -- arch/x86/kvm/vmx/vmx.c | 13 ++++++++++++- 4 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5e8d8443154e..34fc74b0d58a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -596,8 +596,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) struct kvm_host_map map; u64 vmcb12_gpa; - ++vcpu->stat.nested_run; - if (is_smm(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e7bec71a3d9b..d223f5dfac53 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3844,6 +3844,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm->next_rip = 0; if (is_guest_mode(vcpu)) { nested_sync_control_from_vmcb02(svm); + + /* Track VMRUNs that have made past consistency checking */ + if (svm->nested.nested_run_pending && + svm->vmcb->control.exit_code != SVM_EXIT_ERR) + ++vcpu->stat.nested_run; + svm->nested.nested_run_pending = 0; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 98b5f5f104da..e77b8ee28df8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3470,8 +3470,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); enum nested_evmptrld_status evmptrld_status; - ++vcpu->stat.nested_run; - if (!nested_vmx_check_permission(vcpu)) return 1; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 51bbde75b1fd..5aa0e54c793b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6809,7 +6809,18 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_load_host_xsave_state(vcpu); - vmx->nested.nested_run_pending = 0; + if (is_guest_mode(vcpu)) { + /* + * Track VMLAUNCH/VMRESUME that have made past guest state + * checking. + */ + if (vmx->nested.nested_run_pending && + !vmx->exit_reason.failed_vmentry) + ++vcpu->stat.nested_run; + + vmx->nested.nested_run_pending = 0; + } + vmx->idt_vectoring_info = 0; if (unlikely(vmx->fail)) { -- cgit v1.2.3 From d5a0483f9f3250fe359224327ca1b4a29d106981 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Wed, 9 Jun 2021 14:03:39 -0400 Subject: KVM: nVMX: nSVM: Add a new VCPU statistic to show if VCPU is in guest mode Add the following per-VCPU statistic to KVM debugfs to show if a given VCPU is in guest mode: guest_mode Also add this as a per-VM statistic to KVM debugfs to show the total number of VCPUs that are in guest mode in a given VM. Signed-off-by: Krish Sadhukhan Message-Id: <20210609180340.104248-3-krish.sadhukhan@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/debugfs.c | 11 +++++++++++ arch/x86/kvm/kvm_cache_regs.h | 3 +++ arch/x86/kvm/x86.c | 1 + 4 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 921de30c23c5..bea7290ef173 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1180,6 +1180,7 @@ struct kvm_vcpu_stat { u64 nested_run; u64 directed_yield_attempted; u64 directed_yield_successful; + u64 guest_mode; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 7e818d64bb4d..95a98413dc32 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -17,6 +17,15 @@ static int vcpu_get_timer_advance_ns(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_timer_advance_ns_fops, vcpu_get_timer_advance_ns, NULL, "%llu\n"); +static int vcpu_get_guest_mode(void *data, u64 *val) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; + *val = vcpu->stat.guest_mode; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(vcpu_guest_mode_fops, vcpu_get_guest_mode, NULL, "%lld\n"); + static int vcpu_get_tsc_offset(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; @@ -45,6 +54,8 @@ DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bi void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry) { + debugfs_create_file("guest_mode", 0444, debugfs_dentry, vcpu, + &vcpu_guest_mode_fops); debugfs_create_file("tsc-offset", 0444, debugfs_dentry, vcpu, &vcpu_tsc_offset_fops); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 3db5c42c9ecd..ebddbd37a0bf 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -162,6 +162,7 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; + vcpu->stat.guest_mode = 1; } static inline void leave_guest_mode(struct kvm_vcpu *vcpu) @@ -172,6 +173,8 @@ static inline void leave_guest_mode(struct kvm_vcpu *vcpu) vcpu->arch.load_eoi_exitmap_pending = false; kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu); } + + vcpu->stat.guest_mode = 0; } static inline bool is_guest_mode(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8d88e4513294..0e2dbc7fdb97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -250,6 +250,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("nested_run", nested_run), VCPU_STAT("directed_yield_attempted", directed_yield_attempted), VCPU_STAT("directed_yield_successful", directed_yield_successful), + VCPU_STAT("guest_mode", guest_mode), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), -- cgit v1.2.3 From a6c776a952175e0fad22110e8d43019f3ac6f9af Mon Sep 17 00:00:00 2001 From: Vineeth Pillai Date: Thu, 3 Jun 2021 15:14:34 +0000 Subject: hyperv: Detect Nested virtualization support for SVM Previously, to detect nested virtualization enlightenment support, we were using HV_X64_ENLIGHTENED_VMCS_RECOMMENDED feature bit of HYPERV_CPUID_ENLIGHTMENT_INFO.EAX CPUID as docuemented in TLFS: "Bit 14: Recommend a nested hypervisor using the enlightened VMCS interface. Also indicates that additional nested enlightenments may be available (see leaf 0x4000000A)". Enlightened VMCS, however, is an Intel only feature so the above detection method doesn't work for AMD. So, use the HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS.EAX CPUID information ("The maximum input value for hypervisor CPUID information.") and this works for both AMD and Intel. Signed-off-by: Vineeth Pillai Reviewed-by: Michael Kelley Message-Id: <43b25ff21cd2d9a51582033c9bdd895afefac056.1622730232.git.viremana@linux.microsoft.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/cpu/mshyperv.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 22f13343b5da..c268c2730048 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -252,6 +252,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) static void __init ms_hyperv_init_platform(void) { + int hv_max_functions_eax; int hv_host_info_eax; int hv_host_info_ebx; int hv_host_info_ecx; @@ -269,6 +270,8 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); + hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); + pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, ms_hyperv.misc_features); @@ -298,8 +301,7 @@ static void __init ms_hyperv_init_platform(void) /* * Extract host information. */ - if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >= - HYPERV_CPUID_VERSION) { + if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) { hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); @@ -325,9 +327,11 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); } - if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) { + if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { ms_hyperv.nested_features = cpuid_eax(HYPERV_CPUID_NESTED_FEATURES); + pr_info("Hyper-V: Nested features: 0x%x\n", + ms_hyperv.nested_features); } /* -- cgit v1.2.3 From 32431fb2538df56693a5852a50013549c827f57c Mon Sep 17 00:00:00 2001 From: Vineeth Pillai Date: Thu, 3 Jun 2021 15:14:35 +0000 Subject: hyperv: SVM enlightened TLB flush support flag Bit 22 of HYPERV_CPUID_FEATURES.EDX is specific to SVM and specifies support for enlightened TLB flush. With this enlightenment enabled, ASID invalidations flushes only gva->hpa entries. To flush TLB entries derived from NPT, hypercalls should be used (HvFlushGuestPhysicalAddressSpace or HvFlushGuestPhysicalAddressList) Signed-off-by: Vineeth Pillai Reviewed-by: Michael Kelley Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 9fe4cc9c0f7d..f1366ce609e3 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -138,6 +138,15 @@ #define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) +/* + * This is specific to AMD and specifies that enlightened TLB flush is + * supported. If guest opts in to this feature, ASID invalidations only + * flushes gva -> hpa mapping entries. To flush the TLB entries derived + * from NPT, hypercalls should be used (HvFlushGuestPhysicalAddressSpace + * or HvFlushGuestPhysicalAddressList). + */ +#define HV_X64_NESTED_ENLIGHTENED_TLB BIT(22) + /* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */ #define HV_PARAVISOR_PRESENT BIT(0) -- cgit v1.2.3 From 3c86c0d3dbb98865a60a0c9d5c3a229af15a8a96 Mon Sep 17 00:00:00 2001 From: Vineeth Pillai Date: Thu, 3 Jun 2021 15:14:36 +0000 Subject: KVM: x86: hyper-v: Move the remote TLB flush logic out of vmx Currently the remote TLB flush logic is specific to VMX. Move it to a common place so that SVM can use it as well. Signed-off-by: Vineeth Pillai Message-Id: <4f4e4ca19778437dae502f44363a38e99e3ef5d1.1622730232.git.viremana@linux.microsoft.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 9 ++++ arch/x86/kvm/Makefile | 5 ++ arch/x86/kvm/kvm_onhyperv.c | 93 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/kvm_onhyperv.h | 32 ++++++++++++ arch/x86/kvm/vmx/vmx.c | 105 +--------------------------------------- arch/x86/kvm/vmx/vmx.h | 9 ---- arch/x86/kvm/x86.c | 9 ++++ 7 files changed, 150 insertions(+), 112 deletions(-) create mode 100644 arch/x86/kvm/kvm_onhyperv.c create mode 100644 arch/x86/kvm/kvm_onhyperv.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bea7290ef173..1fdb212127c4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -852,6 +852,10 @@ struct kvm_vcpu_arch { /* Protected Guests */ bool guest_state_protected; + +#if IS_ENABLED(CONFIG_HYPERV) + hpa_t hv_root_tdp; +#endif }; struct kvm_lpage_info { @@ -1131,6 +1135,11 @@ struct kvm_arch { * allocated for any newly created or modified memslots. */ bool memslots_have_rmaps; + +#if IS_ENABLED(CONFIG_HYPERV) + hpa_t hv_root_tdp; + spinlock_t hv_root_tdp_lock; +#endif }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c589db5d91b3..a06745c2fef1 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -18,6 +18,11 @@ kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ mmu/spte.o + +ifdef CONFIG_HYPERV +kvm-y += kvm_onhyperv.o +endif + kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_XEN) += xen.o diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c new file mode 100644 index 000000000000..c7db2df50a7a --- /dev/null +++ b/arch/x86/kvm/kvm_onhyperv.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM L1 hypervisor optimizations on Hyper-V. + */ + +#include +#include + +#include "hyperv.h" +#include "kvm_onhyperv.h" + +static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, + void *data) +{ + struct kvm_tlb_range *range = data; + + return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, + range->pages); +} + +static inline int hv_remote_flush_root_tdp(hpa_t root_tdp, + struct kvm_tlb_range *range) +{ + if (range) + return hyperv_flush_guest_mapping_range(root_tdp, + kvm_fill_hv_flush_list_func, (void *)range); + else + return hyperv_flush_guest_mapping(root_tdp); +} + +int hv_remote_flush_tlb_with_range(struct kvm *kvm, + struct kvm_tlb_range *range) +{ + struct kvm_arch *kvm_arch = &kvm->arch; + struct kvm_vcpu *vcpu; + int ret = 0, i, nr_unique_valid_roots; + hpa_t root; + + spin_lock(&kvm_arch->hv_root_tdp_lock); + + if (!VALID_PAGE(kvm_arch->hv_root_tdp)) { + nr_unique_valid_roots = 0; + + /* + * Flush all valid roots, and see if all vCPUs have converged + * on a common root, in which case future flushes can skip the + * loop and flush the common root. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + root = vcpu->arch.hv_root_tdp; + if (!VALID_PAGE(root) || root == kvm_arch->hv_root_tdp) + continue; + + /* + * Set the tracked root to the first valid root. Keep + * this root for the entirety of the loop even if more + * roots are encountered as a low effort optimization + * to avoid flushing the same (first) root again. + */ + if (++nr_unique_valid_roots == 1) + kvm_arch->hv_root_tdp = root; + + if (!ret) + ret = hv_remote_flush_root_tdp(root, range); + + /* + * Stop processing roots if a failure occurred and + * multiple valid roots have already been detected. + */ + if (ret && nr_unique_valid_roots > 1) + break; + } + + /* + * The optimized flush of a single root can't be used if there + * are multiple valid roots (obviously). + */ + if (nr_unique_valid_roots > 1) + kvm_arch->hv_root_tdp = INVALID_PAGE; + } else { + ret = hv_remote_flush_root_tdp(kvm_arch->hv_root_tdp, range); + } + + spin_unlock(&kvm_arch->hv_root_tdp_lock); + return ret; +} +EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range); + +int hv_remote_flush_tlb(struct kvm *kvm) +{ + return hv_remote_flush_tlb_with_range(kvm, NULL); +} +EXPORT_SYMBOL_GPL(hv_remote_flush_tlb); diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h new file mode 100644 index 000000000000..1c67abf2eba9 --- /dev/null +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM L1 hypervisor optimizations on Hyper-V. + */ + +#ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__ +#define __ARCH_X86_KVM_KVM_ONHYPERV_H__ + +#if IS_ENABLED(CONFIG_HYPERV) +int hv_remote_flush_tlb_with_range(struct kvm *kvm, + struct kvm_tlb_range *range); +int hv_remote_flush_tlb(struct kvm *kvm); + +static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) +{ + struct kvm_arch *kvm_arch = &vcpu->kvm->arch; + + if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { + spin_lock(&kvm_arch->hv_root_tdp_lock); + vcpu->arch.hv_root_tdp = root_tdp; + if (root_tdp != kvm_arch->hv_root_tdp) + kvm_arch->hv_root_tdp = INVALID_PAGE; + spin_unlock(&kvm_arch->hv_root_tdp_lock); + } +} +#else /* !CONFIG_HYPERV */ +static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp) +{ +} +#endif /* !CONFIG_HYPERV */ + +#endif diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5aa0e54c793b..e3f744bec763 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -52,6 +52,7 @@ #include "cpuid.h" #include "evmcs.h" #include "hyperv.h" +#include "kvm_onhyperv.h" #include "irq.h" #include "kvm_cache_regs.h" #include "lapic.h" @@ -458,86 +459,6 @@ static unsigned long host_idt_base; static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); -static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, - void *data) -{ - struct kvm_tlb_range *range = data; - - return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, - range->pages); -} - -static inline int hv_remote_flush_root_ept(hpa_t root_ept, - struct kvm_tlb_range *range) -{ - if (range) - return hyperv_flush_guest_mapping_range(root_ept, - kvm_fill_hv_flush_list_func, (void *)range); - else - return hyperv_flush_guest_mapping(root_ept); -} - -static int hv_remote_flush_tlb_with_range(struct kvm *kvm, - struct kvm_tlb_range *range) -{ - struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); - struct kvm_vcpu *vcpu; - int ret = 0, i, nr_unique_valid_roots; - hpa_t root; - - spin_lock(&kvm_vmx->hv_root_ept_lock); - - if (!VALID_PAGE(kvm_vmx->hv_root_ept)) { - nr_unique_valid_roots = 0; - - /* - * Flush all valid roots, and see if all vCPUs have converged - * on a common root, in which case future flushes can skip the - * loop and flush the common root. - */ - kvm_for_each_vcpu(i, vcpu, kvm) { - root = to_vmx(vcpu)->hv_root_ept; - if (!VALID_PAGE(root) || root == kvm_vmx->hv_root_ept) - continue; - - /* - * Set the tracked root to the first valid root. Keep - * this root for the entirety of the loop even if more - * roots are encountered as a low effort optimization - * to avoid flushing the same (first) root again. - */ - if (++nr_unique_valid_roots == 1) - kvm_vmx->hv_root_ept = root; - - if (!ret) - ret = hv_remote_flush_root_ept(root, range); - - /* - * Stop processing roots if a failure occurred and - * multiple valid roots have already been detected. - */ - if (ret && nr_unique_valid_roots > 1) - break; - } - - /* - * The optimized flush of a single root can't be used if there - * are multiple valid roots (obviously). - */ - if (nr_unique_valid_roots > 1) - kvm_vmx->hv_root_ept = INVALID_PAGE; - } else { - ret = hv_remote_flush_root_ept(kvm_vmx->hv_root_ept, range); - } - - spin_unlock(&kvm_vmx->hv_root_ept_lock); - return ret; -} -static int hv_remote_flush_tlb(struct kvm *kvm) -{ - return hv_remote_flush_tlb_with_range(kvm, NULL); -} - static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) { struct hv_enlightened_vmcs *evmcs; @@ -565,21 +486,6 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) #endif /* IS_ENABLED(CONFIG_HYPERV) */ -static void hv_track_root_ept(struct kvm_vcpu *vcpu, hpa_t root_ept) -{ -#if IS_ENABLED(CONFIG_HYPERV) - struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); - - if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) { - spin_lock(&kvm_vmx->hv_root_ept_lock); - to_vmx(vcpu)->hv_root_ept = root_ept; - if (root_ept != kvm_vmx->hv_root_ept) - kvm_vmx->hv_root_ept = INVALID_PAGE; - spin_unlock(&kvm_vmx->hv_root_ept_lock); - } -#endif -} - /* * Comment's format: document - errata name - stepping - processor name. * Refer from @@ -3184,7 +3090,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, eptp = construct_eptp(vcpu, root_hpa, root_level); vmcs_write64(EPT_POINTER, eptp); - hv_track_root_ept(vcpu, root_hpa); + hv_track_root_tdp(vcpu, root_hpa); if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; @@ -6966,9 +6872,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->pi_desc.nv = POSTED_INTR_VECTOR; vmx->pi_desc.sn = 1; -#if IS_ENABLED(CONFIG_HYPERV) - vmx->hv_root_ept = INVALID_PAGE; -#endif return 0; free_vmcs: @@ -6985,10 +6888,6 @@ free_vpid: static int vmx_vm_init(struct kvm *kvm) { -#if IS_ENABLED(CONFIG_HYPERV) - spin_lock_init(&to_kvm_vmx(kvm)->hv_root_ept_lock); -#endif - if (!ple_gap) kvm->arch.pause_in_guest = true; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 3eaa86a0ba3e..5740f8e2aa23 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -334,10 +334,6 @@ struct vcpu_vmx { /* SGX Launch Control public key hash */ u64 msr_ia32_sgxlepubkeyhash[4]; -#if IS_ENABLED(CONFIG_HYPERV) - u64 hv_root_ept; -#endif - struct pt_desc pt_desc; struct lbr_desc lbr_desc; @@ -355,11 +351,6 @@ struct kvm_vmx { unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; - -#if IS_ENABLED(CONFIG_HYPERV) - hpa_t hv_root_ept; - spinlock_t hv_root_ept_lock; -#endif }; bool nested_vmx_allowed(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e2dbc7fdb97..63e48738764e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10494,6 +10494,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pending_external_vector = -1; vcpu->arch.preempted_in_kernel = false; +#if IS_ENABLED(CONFIG_HYPERV) + vcpu->arch.hv_root_tdp = INVALID_PAGE; +#endif + r = static_call(kvm_x86_vcpu_create)(vcpu); if (r) goto free_guest_fpu; @@ -10878,6 +10882,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.guest_can_read_msr_platform_info = true; +#if IS_ENABLED(CONFIG_HYPERV) + spin_lock_init(&kvm->arch.hv_root_tdp_lock); + kvm->arch.hv_root_tdp = INVALID_PAGE; +#endif + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); -- cgit v1.2.3 From 59d21d67f37481cfde25551ee6a467fa943812b4 Mon Sep 17 00:00:00 2001 From: Vineeth Pillai Date: Thu, 3 Jun 2021 15:14:37 +0000 Subject: KVM: SVM: Software reserved fields SVM added support for certain reserved fields to be used by software or hypervisor. Add the following reserved fields: - VMCB offset 0x3e0 - 0x3ff - Clean bit 31 - SVM intercept exit code 0xf0000000 Later patches will make use of this for supporting Hyper-V nested virtualization enhancements. Signed-off-by: Vineeth Pillai Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 9 +++++++-- arch/x86/include/uapi/asm/svm.h | 3 +++ arch/x86/kvm/svm/svm.h | 17 +++++++++++++++-- 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 772e60efe243..e322676039f4 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -156,6 +156,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 avic_physical_id; /* Offset 0xf8 */ u8 reserved_7[8]; u64 vmsa_pa; /* Used for an SEV-ES guest */ + u8 reserved_8[720]; + /* + * Offset 0x3e0, 32 bytes reserved + * for use by hypervisor/software. + */ + u8 reserved_sw[32]; }; @@ -314,7 +320,7 @@ struct ghcb { #define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 -#define EXPECTED_VMCB_CONTROL_AREA_SIZE 272 +#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) @@ -326,7 +332,6 @@ static inline void __unused_size_checks(void) struct vmcb { struct vmcb_control_area control; - u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; struct vmcb_save_area save; } __packed; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 554f75fe013c..efa969325ede 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -110,6 +110,9 @@ #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff +/* Exit code reserved for hypervisor/software use */ +#define SVM_EXIT_SW 0xf0000000 + #define SVM_EXIT_ERR -1 #define SVM_EXIT_REASONS \ diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index a514b490db4a..af09bcd229bd 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -31,6 +31,11 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +/* + * Clean bits in VMCB. + * VMCB_ALL_CLEAN_MASK might also need to + * be updated if this enum is modified. + */ enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, pause filter count */ @@ -48,9 +53,17 @@ enum { * AVIC PHYSICAL_TABLE pointer, * AVIC LOGICAL_TABLE pointer */ - VMCB_DIRTY_MAX, + VMCB_SW = 31, /* Reserved for hypervisor/software use */ }; +#define VMCB_ALL_CLEAN_MASK ( \ + (1U << VMCB_INTERCEPTS) | (1U << VMCB_PERM_MAP) | \ + (1U << VMCB_ASID) | (1U << VMCB_INTR) | \ + (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \ + (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \ + (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \ + (1U << VMCB_SW)) + /* TPR and CR2 are always written before VMRUN */ #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) @@ -237,7 +250,7 @@ static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) static inline void vmcb_mark_all_clean(struct vmcb *vmcb) { - vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) + vmcb->control.clean = VMCB_ALL_CLEAN_MASK & ~VMCB_ALWAYS_DIRTY_MASK; } -- cgit v1.2.3 From 1e0c7d40758bcd45b4af936031144e995f87a7f6 Mon Sep 17 00:00:00 2001 From: Vineeth Pillai Date: Thu, 3 Jun 2021 15:14:38 +0000 Subject: KVM: SVM: hyper-v: Remote TLB flush for SVM Enable remote TLB flush for SVM. Signed-off-by: Vineeth Pillai Message-Id: <1ee364e397e142aed662d2920d198cd03772f1a5.1622730232.git.viremana@linux.microsoft.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 9 ++++++ arch/x86/kvm/svm/svm_onhyperv.h | 66 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 arch/x86/kvm/svm/svm_onhyperv.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d223f5dfac53..4d7b67c78a89 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -43,6 +43,9 @@ #include "svm.h" #include "svm_ops.h" +#include "kvm_onhyperv.h" +#include "svm_onhyperv.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -1003,6 +1006,8 @@ static __init int svm_hardware_setup(void) /* Note, SEV setup consumes npt_enabled. */ sev_hardware_setup(); + svm_hv_hardware_setup(); + svm_adjust_mmio_mask(); for_each_possible_cpu(cpu) { @@ -1296,6 +1301,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } } + svm_hv_init_vmcb(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -3892,6 +3899,8 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); vmcb_mark_dirty(svm->vmcb, VMCB_NPT); + hv_track_root_tdp(vcpu, root_hpa); + /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) return; diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h new file mode 100644 index 000000000000..57291e222395 --- /dev/null +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM L1 hypervisor optimizations on Hyper-V for SVM. + */ + +#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__ +#define __ARCH_X86_KVM_SVM_ONHYPERV_H__ + +#if IS_ENABLED(CONFIG_HYPERV) +#include + +#include "hyperv.h" +#include "kvm_onhyperv.h" + +static struct kvm_x86_ops svm_x86_ops; + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB + * control area to expose SVM enlightenments to guests. + */ +struct hv_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 enlightened_npt_tlb: 1; + u32 reserved:29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +static inline void svm_hv_init_vmcb(struct vmcb *vmcb) +{ + struct hv_enlightenments *hve = + (struct hv_enlightenments *)vmcb->control.reserved_sw; + + if (npt_enabled && + ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) + hve->hv_enlightenments_control.enlightened_npt_tlb = 1; +} + +static inline void svm_hv_hardware_setup(void) +{ + if (npt_enabled && + ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) { + pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n"); + svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; + svm_x86_ops.tlb_remote_flush_with_range = + hv_remote_flush_tlb_with_range; + } +} + +#else + +static inline void svm_hv_init_vmcb(struct vmcb *vmcb) +{ +} + +static inline void svm_hv_hardware_setup(void) +{ +} +#endif /* CONFIG_HYPERV */ + +#endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */ -- cgit v1.2.3 From c4327f15dfc7294b2abde0ea49b3e43eec3cca38 Mon Sep 17 00:00:00 2001 From: Vineeth Pillai Date: Thu, 3 Jun 2021 15:14:39 +0000 Subject: KVM: SVM: hyper-v: Enlightened MSR-Bitmap support Enlightened MSR-Bitmap as per TLFS: "The L1 hypervisor may collaborate with the L0 hypervisor to make MSR accesses more efficient. It can enable enlightened MSR bitmaps by setting the corresponding field in the enlightened VMCS to 1. When enabled, L0 hypervisor does not monitor the MSR bitmaps for changes. Instead, the L1 hypervisor must invalidate the corresponding clean field after making changes to one of the MSR bitmaps." Enable this for SVM. Related VMX changes: commit ceef7d10dfb6 ("KVM: x86: VMX: hyper-v: Enlightened MSR-Bitmap support") Signed-off-by: Vineeth Pillai Message-Id: <87df0710f95d28b91cc4ea014fc4d71056eebbee.1622730232.git.viremana@linux.microsoft.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 3 +++ arch/x86/kvm/svm/svm.h | 5 +++++ arch/x86/kvm/svm/svm_onhyperv.h | 27 +++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4d7b67c78a89..1b0056ef36af 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -683,6 +683,9 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); msrpm[offset] = tmp; + + svm_hv_vmcb_dirty_nested_enlightenments(vcpu); + } void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index af09bcd229bd..670f0c0ed73b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -254,6 +254,11 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb) & ~VMCB_ALWAYS_DIRTY_MASK; } +static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit) +{ + return (vmcb->control.clean & (1 << bit)); +} + static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 57291e222395..0f262460b2e6 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -31,6 +31,11 @@ struct hv_enlightenments { u64 reserved; } __packed; +/* + * Hyper-V uses the software reserved clean bit in VMCB + */ +#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW + static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { struct hv_enlightenments *hve = @@ -52,6 +57,23 @@ static inline void svm_hv_hardware_setup(void) } } +static inline void svm_hv_vmcb_dirty_nested_enlightenments( + struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = to_svm(vcpu)->vmcb; + struct hv_enlightenments *hve = + (struct hv_enlightenments *)vmcb->control.reserved_sw; + + /* + * vmcb can be NULL if called during early vcpu init. + * And its okay not to mark vmcb dirty during vcpu init + * as we mark it dirty unconditionally towards end of vcpu + * init phase. + */ + if (vmcb && vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && + hve->hv_enlightenments_control.msr_bitmap) + vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); +} #else static inline void svm_hv_init_vmcb(struct vmcb *vmcb) @@ -61,6 +83,11 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb) static inline void svm_hv_hardware_setup(void) { } + +static inline void svm_hv_vmcb_dirty_nested_enlightenments( + struct kvm_vcpu *vcpu) +{ +} #endif /* CONFIG_HYPERV */ #endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */ -- cgit v1.2.3 From 1183646a67d01ef9c46ac87da1c57dea5f7bb153 Mon Sep 17 00:00:00 2001 From: Vineeth Pillai Date: Thu, 3 Jun 2021 15:14:40 +0000 Subject: KVM: SVM: hyper-v: Direct Virtual Flush support From Hyper-V TLFS: "The hypervisor exposes hypercalls (HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressList, and HvFlushVirtualAddressListEx) that allow operating systems to more efficiently manage the virtual TLB. The L1 hypervisor can choose to allow its guest to use those hypercalls and delegate the responsibility to handle them to the L0 hypervisor. This requires the use of a partition assist page." Add the Direct Virtual Flush support for SVM. Related VMX changes: commit 6f6a657c9998 ("KVM/Hyper-V/VMX: Add direct tlb flush support") Signed-off-by: Vineeth Pillai Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 4 ++++ arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/svm/svm_onhyperv.c | 41 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm_onhyperv.h | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+) create mode 100644 arch/x86/kvm/svm/svm_onhyperv.c (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a06745c2fef1..83331376b779 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -32,6 +32,10 @@ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o +ifdef CONFIG_HYPERV +kvm-amd-y += svm/svm_onhyperv.o +endif + obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1b0056ef36af..9bb4692728ef 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3781,6 +3781,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } svm->vmcb->save.cr2 = vcpu->arch.cr2; + svm_hv_update_vp_id(svm->vmcb, vcpu); + /* * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c new file mode 100644 index 000000000000..98aa981c04ec --- /dev/null +++ b/arch/x86/kvm/svm/svm_onhyperv.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM L1 hypervisor optimizations on Hyper-V for SVM. + */ + +#include +#include "kvm_cache_regs.h" + +#include + +#include "svm.h" +#include "svm_ops.h" + +#include "hyperv.h" +#include "kvm_onhyperv.h" +#include "svm_onhyperv.h" + +int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu) +{ + struct hv_enlightenments *hve; + struct hv_partition_assist_pg **p_hv_pa_pg = + &to_kvm_hv(vcpu->kvm)->hv_pa_pg; + + if (!*p_hv_pa_pg) + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL); + + if (!*p_hv_pa_pg) + return -ENOMEM; + + hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw; + + hve->partition_assist_page = __pa(*p_hv_pa_pg); + hve->hv_vm_id = (unsigned long)vcpu->kvm; + if (!hve->hv_enlightenments_control.nested_flush_hypercall) { + hve->hv_enlightenments_control.nested_flush_hypercall = 1; + vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + } + + return 0; +} + diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 0f262460b2e6..9b9a55abc29f 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -36,6 +36,8 @@ struct hv_enlightenments { */ #define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW +int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu); + static inline void svm_hv_init_vmcb(struct vmcb *vmcb) { struct hv_enlightenments *hve = @@ -55,6 +57,23 @@ static inline void svm_hv_hardware_setup(void) svm_x86_ops.tlb_remote_flush_with_range = hv_remote_flush_tlb_with_range; } + + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) { + int cpu; + + pr_info("kvm: Hyper-V Direct TLB Flush enabled\n"); + for_each_online_cpu(cpu) { + struct hv_vp_assist_page *vp_ap = + hv_get_vp_assist_page(cpu); + + if (!vp_ap) + continue; + + vp_ap->nested_control.features.directhypercall = 1; + } + svm_x86_ops.enable_direct_tlbflush = + svm_hv_enable_direct_tlbflush; + } } static inline void svm_hv_vmcb_dirty_nested_enlightenments( @@ -74,6 +93,19 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); } + +static inline void svm_hv_update_vp_id(struct vmcb *vmcb, + struct kvm_vcpu *vcpu) +{ + struct hv_enlightenments *hve = + (struct hv_enlightenments *)vmcb->control.reserved_sw; + u32 vp_index = kvm_hv_get_vpindex(vcpu); + + if (hve->hv_vp_id != vp_index) { + hve->hv_vp_id = vp_index; + vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); + } +} #else static inline void svm_hv_init_vmcb(struct vmcb *vmcb) @@ -88,6 +120,11 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( struct kvm_vcpu *vcpu) { } + +static inline void svm_hv_update_vp_id(struct vmcb *vmcb, + struct kvm_vcpu *vcpu) +{ +} #endif /* CONFIG_HYPERV */ #endif /* __ARCH_X86_KVM_SVM_ONHYPERV_H__ */ -- cgit v1.2.3 From 644f706719f0297bc5f65c8891de1c32f042eae5 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:36 +0200 Subject: KVM: x86: hyper-v: Introduce KVM_CAP_HYPERV_ENFORCE_CPUID Modeled after KVM_CAP_ENFORCE_PV_FEATURE_CPUID, the new capability allows for limiting Hyper-V features to those exposed to the guest in Hyper-V CPUIDs (0x40000003, 0x40000004, ...). Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 +++++++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/hyperv.c | 21 +++++++++++++++++++++ arch/x86/kvm/hyperv.h | 1 + arch/x86/kvm/x86.c | 4 ++++ include/uapi/linux/kvm.h | 1 + 6 files changed, 39 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 7fcb2fd38f42..80154d5d98a1 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6891,3 +6891,14 @@ This capability is always enabled. This capability indicates that the KVM virtual PTP service is supported in the host. A VMM can check whether the service is available to the guest on migration. + +8.33 KVM_CAP_HYPERV_ENFORCE_CPUID +----------------------------- + +Architectures: x86 + +When enabled, KVM will disable emulated Hyper-V features provided to the +guest according to the bits Hyper-V CPUID feature leaves. Otherwise, all +currently implmented Hyper-V features are provided unconditionally when +Hyper-V identification is set in the HYPERV_CPUID_INTERFACE (0x40000001) +leaf. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1fdb212127c4..556a8ec89451 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -543,6 +543,7 @@ struct kvm_vcpu_hv { struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); cpumask_t tlb_flush; + bool enforce_cpuid; }; /* Xen HVM per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index dbd3152b1379..02b0ee189f82 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1853,6 +1853,27 @@ void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.hyperv_enabled = false; } +int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) +{ + struct kvm_vcpu_hv *hv_vcpu; + int ret = 0; + + if (!to_hv_vcpu(vcpu)) { + if (enforce) { + ret = kvm_hv_vcpu_init(vcpu); + if (ret) + return ret; + } else { + return 0; + } + } + + hv_vcpu = to_hv_vcpu(vcpu); + hv_vcpu->enforce_cpuid = enforce; + + return ret; +} + bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 60547d5cb6d7..730da8537d05 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -138,6 +138,7 @@ void kvm_hv_invalidate_tsc_page(struct kvm *kvm); void kvm_hv_init_vm(struct kvm *kvm); void kvm_hv_destroy_vm(struct kvm *kvm); void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu); +int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce); int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args); int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63e48738764e..475376a97419 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3955,6 +3955,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: case KVM_CAP_HYPERV_CPUID: + case KVM_CAP_HYPERV_ENFORCE_CPUID: case KVM_CAP_SYS_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: @@ -4878,6 +4879,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); + case KVM_CAP_HYPERV_ENFORCE_CPUID: + return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; if (vcpu->arch.pv_cpuid.enforce) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 79d9c44d1ad7..792816144092 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1083,6 +1083,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SGX_ATTRIBUTE 196 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 #define KVM_CAP_PTP_KVM 198 +#define KVM_CAP_HYPERV_ENFORCE_CPUID 199 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 10d7bf1e46dc19d964f0f67d2a6d20df907742d1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:37 +0200 Subject: KVM: x86: hyper-v: Cache guest CPUID leaves determining features availability Limiting exposed Hyper-V features requires a fast way to check if the particular feature is exposed in guest visible CPUIDs or not. To aboid looping through all CPUID entries on every hypercall/MSR access cache the required leaves on CPUID update. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 +++++++ arch/x86/kvm/hyperv.c | 49 ++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 556a8ec89451..95b254b5a523 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -544,6 +544,14 @@ struct kvm_vcpu_hv { DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); cpumask_t tlb_flush; bool enforce_cpuid; + struct { + u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */ + u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */ + u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */ + u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ + u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */ + u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ + } cpuid_cache; }; /* Xen HVM per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 02b0ee189f82..7e7928fc77ef 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -274,15 +274,10 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *entry; - - entry = kvm_find_cpuid_entry(vcpu, - HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, - 0); - if (!entry) - return false; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); - return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + return hv_vcpu->cpuid_cache.syndbg_cap_eax & + HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; } static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) @@ -1845,12 +1840,46 @@ ret_success: void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); - if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) + if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { vcpu->arch.hyperv_enabled = true; - else + } else { vcpu->arch.hyperv_enabled = false; + return; + } + + if (!to_hv_vcpu(vcpu) && kvm_hv_vcpu_init(vcpu)) + return; + + hv_vcpu = to_hv_vcpu(vcpu); + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES, 0); + if (entry) { + hv_vcpu->cpuid_cache.features_eax = entry->eax; + hv_vcpu->cpuid_cache.features_ebx = entry->ebx; + hv_vcpu->cpuid_cache.features_edx = entry->edx; + } else { + hv_vcpu->cpuid_cache.features_eax = 0; + hv_vcpu->cpuid_cache.features_ebx = 0; + hv_vcpu->cpuid_cache.features_edx = 0; + } + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO, 0); + if (entry) { + hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; + hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; + } else { + hv_vcpu->cpuid_cache.enlightenments_eax = 0; + hv_vcpu->cpuid_cache.enlightenments_ebx = 0; + } + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0); + if (entry) + hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; + else + hv_vcpu->cpuid_cache.syndbg_cap_eax = 0; } int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) -- cgit v1.2.3 From b4128000e2c9b176a449d748dcb083c61d61cc6e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:38 +0200 Subject: KVM: x86: hyper-v: Prepare to check access to Hyper-V MSRs Introduce hv_check_msr_access() to check if the particular MSR should be accessible by guest, this will be used with KVM_CAP_HYPERV_ENFORCE_CPUID mode. No functional change intended. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7e7928fc77ef..ab8dc23f05bf 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1202,12 +1202,21 @@ out_unlock: mutex_unlock(&hv->hv_lock); } + +static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) +{ + return true; +} + static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); + if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) + return 1; + switch (msr) { case HV_X64_MSR_GUEST_OS_ID: hv->hv_guest_os_id = data; @@ -1336,6 +1345,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) + return 1; + switch (msr) { case HV_X64_MSR_VP_INDEX: { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); @@ -1450,6 +1462,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); + if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) + return 1; + switch (msr) { case HV_X64_MSR_GUEST_OS_ID: data = hv->hv_guest_os_id; @@ -1499,6 +1514,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, u64 data = 0; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) + return 1; + switch (msr) { case HV_X64_MSR_VP_INDEX: data = hv_vcpu->vp_index; -- cgit v1.2.3 From 1561c2cb87ab39400d76998bf7be581c1e57f108 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:39 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_HYPERCALL_AVAILABLE privilege bit HV_X64_MSR_GUEST_OS_ID/HV_X64_MSR_HYPERCALL are only available to guest when HV_MSR_HYPERCALL_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ab8dc23f05bf..cb66842ccb8d 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1205,6 +1205,18 @@ out_unlock: static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) { + if (!hv_vcpu->enforce_cpuid) + return true; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_HYPERCALL_AVAILABLE; + default: + break; + } + return true; } -- cgit v1.2.3 From b80a92ff81587c556da740e9073821b5c3c23b72 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:40 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_VP_RUNTIME_AVAILABLE privilege bit HV_X64_MSR_VP_RUNTIME is only available to guest when HV_MSR_VP_RUNTIME_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-7-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index cb66842ccb8d..6a9eb934ffe0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1213,6 +1213,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_HYPERCALL: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_HYPERCALL_AVAILABLE; + case HV_X64_MSR_VP_RUNTIME: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_VP_RUNTIME_AVAILABLE; default: break; } -- cgit v1.2.3 From c2b32867f2e7bfa7e7521e417ab8bbd586ac6bcc Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:41 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_TIME_REF_COUNT_AVAILABLE privilege bit HV_X64_MSR_TIME_REF_COUNT is only available to guest when HV_MSR_TIME_REF_COUNT_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-8-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6a9eb934ffe0..c90679247185 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1216,6 +1216,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_VP_RUNTIME: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_RUNTIME_AVAILABLE; + case HV_X64_MSR_TIME_REF_COUNT: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_TIME_REF_COUNT_AVAILABLE; default: break; } -- cgit v1.2.3 From d2ac25d4196da2ff404c88bec480c835995ea69c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:42 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_VP_INDEX_AVAILABLE privilege bit HV_X64_MSR_VP_INDEX is only available to guest when HV_MSR_VP_INDEX_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-9-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c90679247185..fb5ed867b53c 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1219,6 +1219,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_TIME_REF_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_TIME_REF_COUNT_AVAILABLE; + case HV_X64_MSR_VP_INDEX: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_VP_INDEX_AVAILABLE; default: break; } -- cgit v1.2.3 From 679008e4bbeb12f4905ee0820cd2d0b9d4a21dbb Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:43 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_RESET_AVAILABLE privilege bit HV_X64_MSR_RESET is only available to guest when HV_MSR_RESET_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-10-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index fb5ed867b53c..1348f7691310 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1222,6 +1222,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_VP_INDEX: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_INDEX_AVAILABLE; + case HV_X64_MSR_RESET: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_RESET_AVAILABLE; default: break; } -- cgit v1.2.3 From a1ec661c3fdc8177a8789a9528d5bcfe0d9fc8a8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:44 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_REFERENCE_TSC_AVAILABLE privilege bit HV_X64_MSR_REFERENCE_TSC is only available to guest when HV_MSR_REFERENCE_TSC_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-11-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1348f7691310..7ca7ea0b6e74 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1225,6 +1225,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_RESET: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_RESET_AVAILABLE; + case HV_X64_MSR_REFERENCE_TSC: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_REFERENCE_TSC_AVAILABLE; default: break; } -- cgit v1.2.3 From 9e2715ca20d7b540a271464b3ac862cf387935c1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:45 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_SYNIC_AVAILABLE privilege bit SynIC MSRs (HV_X64_MSR_SCONTROL, HV_X64_MSR_SVERSION, HV_X64_MSR_SIEFP, HV_X64_MSR_SIMP, HV_X64_MSR_EOM, HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15) are only available to guest when HV_MSR_SYNIC_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-12-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7ca7ea0b6e74..9d3aed3bebcd 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1228,6 +1228,14 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_REFERENCE_TSC: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_REFERENCE_TSC_AVAILABLE; + case HV_X64_MSR_SCONTROL: + case HV_X64_MSR_SVERSION: + case HV_X64_MSR_SIEFP: + case HV_X64_MSR_SIMP: + case HV_X64_MSR_EOM: + case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_SYNIC_AVAILABLE; default: break; } -- cgit v1.2.3 From eba60ddae794bdefb9531cb08e30c19a0bc53c15 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:46 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_SYNTIMER_AVAILABLE privilege bit Synthetic timers MSRs (HV_X64_MSR_STIMER[0-3]_CONFIG, HV_X64_MSR_STIMER[0-3]_COUNT) are only available to guest when HV_MSR_SYNTIMER_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-13-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 9d3aed3bebcd..787fd58593dd 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1236,6 +1236,16 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_SYNIC_AVAILABLE; + case HV_X64_MSR_STIMER0_CONFIG: + case HV_X64_MSR_STIMER1_CONFIG: + case HV_X64_MSR_STIMER2_CONFIG: + case HV_X64_MSR_STIMER3_CONFIG: + case HV_X64_MSR_STIMER0_COUNT: + case HV_X64_MSR_STIMER1_COUNT: + case HV_X64_MSR_STIMER2_COUNT: + case HV_X64_MSR_STIMER3_COUNT: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_SYNTIMER_AVAILABLE; default: break; } -- cgit v1.2.3 From 978b57475c7795824676122acb75a1dea264b6d1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:47 +0200 Subject: KVM: x86: hyper-v: Honor HV_MSR_APIC_ACCESS_AVAILABLE privilege bit HV_X64_MSR_EOI, HV_X64_MSR_ICR, HV_X64_MSR_TPR, and HV_X64_MSR_VP_ASSIST_PAGE are only available to guest when HV_MSR_APIC_ACCESS_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-14-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 787fd58593dd..a168b72334cc 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1246,6 +1246,13 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_STIMER3_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_SYNTIMER_AVAILABLE; + case HV_X64_MSR_EOI: + case HV_X64_MSR_ICR: + case HV_X64_MSR_TPR: + case HV_X64_MSR_VP_ASSIST_PAGE: + return hv_vcpu->cpuid_cache.features_eax & + HV_MSR_APIC_ACCESS_AVAILABLE; + break; default: break; } -- cgit v1.2.3 From 9442f3bd9012f37ba2b4ec3ab2d7c248b137391c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:48 +0200 Subject: KVM: x86: hyper-v: Honor HV_ACCESS_FREQUENCY_MSRS privilege bit HV_X64_MSR_TSC_FREQUENCY/HV_X64_MSR_APIC_FREQUENCY are only available to guest when HV_ACCESS_FREQUENCY_MSRS bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-15-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a168b72334cc..2a0660b4e779 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1253,6 +1253,10 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; break; + case HV_X64_MSR_TSC_FREQUENCY: + case HV_X64_MSR_APIC_FREQUENCY: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_FREQUENCY_MSRS; default: break; } -- cgit v1.2.3 From 234d01baec5b216b60b560672957470f773ecf78 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:49 +0200 Subject: KVM: x86: hyper-v: Honor HV_ACCESS_REENLIGHTENMENT privilege bit HV_X64_MSR_REENLIGHTENMENT_CONTROL/HV_X64_MSR_TSC_EMULATION_CONTROL/ HV_X64_MSR_TSC_EMULATION_STATUS are only available to guest when HV_ACCESS_REENLIGHTENMENT bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-16-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 2a0660b4e779..230f52606e39 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1257,6 +1257,11 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_FREQUENCY_MSRS; + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: + return hv_vcpu->cpuid_cache.features_eax & + HV_ACCESS_REENLIGHTENMENT; default: break; } -- cgit v1.2.3 From 0a19c8992db834c9c9e28c5633720d994629539d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:50 +0200 Subject: KVM: x86: hyper-v: Honor HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE privilege bit HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL are only available to guest when HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-17-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 230f52606e39..7b7da057b54b 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1262,6 +1262,10 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_TSC_EMULATION_STATUS: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_REENLIGHTENMENT; + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + return hv_vcpu->cpuid_cache.features_edx & + HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; default: break; } -- cgit v1.2.3 From 17b6d51771a15c7d8552c3e855b5862b3dce0977 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:51 +0200 Subject: KVM: x86: hyper-v: Honor HV_FEATURE_DEBUG_MSRS_AVAILABLE privilege bit Synthetic debugging MSRs (HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, HV_X64_MSR_SYNDBG_PENDING_BUFFER, HV_X64_MSR_SYNDBG_OPTIONS) are only available to guest when HV_FEATURE_DEBUG_MSRS_AVAILABLE bit is exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-18-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7b7da057b54b..3bf00a9299dd 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1266,6 +1266,10 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) case HV_X64_MSR_CRASH_CTL: return hv_vcpu->cpuid_cache.features_edx & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return hv_vcpu->cpuid_cache.features_edx & + HV_FEATURE_DEBUG_MSRS_AVAILABLE; default: break; } -- cgit v1.2.3 From d66bfa36f9edc5ca8c83206ab39d09091623104e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:52 +0200 Subject: KVM: x86: hyper-v: Inverse the default in hv_check_msr_access() Access to all MSRs is now properly checked. To avoid 'forgetting' to properly check access to new MSRs in the future change the default to 'false' meaning 'no access'. No functional change intended. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-19-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 3bf00a9299dd..db735692fc62 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1274,7 +1274,7 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) break; } - return true; + return false; } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, -- cgit v1.2.3 From 1aa8a4184dbde5f50b70b2c706bcfb6b57da9ea7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:53 +0200 Subject: KVM: x86: hyper-v: Honor HV_STIMER_DIRECT_MODE_AVAILABLE privilege bit Synthetic timers can only be configured in 'direct' mode when HV_STIMER_DIRECT_MODE_AVAILABLE bit was exposed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-20-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index db735692fc62..1c7030311885 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -631,11 +631,17 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && !host) return 1; + if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && + !(hv_vcpu->cpuid_cache.features_edx & + HV_STIMER_DIRECT_MODE_AVAILABLE))) + return 1; + trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); -- cgit v1.2.3 From 4ad81a91119df7c0e868f9e4c82b9159645bc906 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:54 +0200 Subject: KVM: x86: hyper-v: Prepare to check access to Hyper-V hypercalls Introduce hv_check_hypercallr_access() to check if the particular hypercall should be available to guest, this will be used with KVM_CAP_HYPERV_ENFORCE_CPUID mode. No functional change intended. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-21-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1c7030311885..51fc74ea773f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2090,6 +2090,11 @@ static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) kvm_fpu_put(); } +static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) +{ + return true; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { struct kvm_hv_hcall hc; @@ -2132,6 +2137,11 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); + if (unlikely(!hv_check_hypercall_access(to_hv_vcpu(vcpu), hc.code))) { + ret = HV_STATUS_ACCESS_DENIED; + goto hypercall_complete; + } + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep)) { @@ -2238,6 +2248,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; } +hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); } -- cgit v1.2.3 From 34ef7d7b9c0422316ee2c34c564b222255c91532 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:55 +0200 Subject: KVM: x86: hyper-v: Check access to HVCALL_NOTIFY_LONG_SPIN_WAIT hypercall TLFS6.0b states that partition issuing HVCALL_NOTIFY_LONG_SPIN_WAIT must posess 'UseHypercallForLongSpinWait' privilege but there's no corresponding feature bit. Instead, we have "Recommended number of attempts to retry a spinlock failure before notifying the hypervisor about the failures. 0xFFFFFFFF indicates never notify." Use this to check access to the hypercall. Also, check against zero as the corresponding CPUID must be set (and '0' attempts before re-try is weird anyway). Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-22-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 51fc74ea773f..13bfa4e0b93d 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2092,6 +2092,17 @@ static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) { + if (!hv_vcpu->enforce_cpuid) + return true; + + switch (code) { + case HVCALL_NOTIFY_LONG_SPIN_WAIT: + return hv_vcpu->cpuid_cache.enlightenments_ebx && + hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX; + default: + break; + } + return true; } -- cgit v1.2.3 From 4f532b7f969fcba010703fe21e0a85f662373041 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:56 +0200 Subject: KVM: x86: hyper-v: Honor HV_POST_MESSAGES privilege bit Hyper-V partition must possess 'HV_POST_MESSAGES' privilege to issue HVCALL_POST_MESSAGE hypercalls. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-23-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 13bfa4e0b93d..293619998c38 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2099,6 +2099,8 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) case HVCALL_NOTIFY_LONG_SPIN_WAIT: return hv_vcpu->cpuid_cache.enlightenments_ebx && hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX; + case HVCALL_POST_MESSAGE: + return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES; default: break; } -- cgit v1.2.3 From a60b3c594ef3221275d4fa8aa94e206607ea66f3 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:57 +0200 Subject: KVM: x86: hyper-v: Honor HV_SIGNAL_EVENTS privilege bit Hyper-V partition must possess 'HV_SIGNAL_EVENTS' privilege to issue HVCALL_SIGNAL_EVENT hypercalls. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-24-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 293619998c38..ce057827da03 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2101,6 +2101,8 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX; case HVCALL_POST_MESSAGE: return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES; + case HVCALL_SIGNAL_EVENT: + return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS; default: break; } -- cgit v1.2.3 From a921cf83cc4c927f29eef1e7a17bff176c74b886 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:58 +0200 Subject: KVM: x86: hyper-v: Honor HV_DEBUGGING privilege bit Hyper-V partition must possess 'HV_DEBUGGING' privilege to issue HVCALL_POST_DEBUG_DATA/HVCALL_RETRIEVE_DEBUG_DATA/ HVCALL_RESET_DEBUG_SESSION hypercalls. Note, when SynDBG is disabled hv_check_hypercall_access() returns 'true' (like for any other unknown hypercall) so the result will be HV_STATUS_INVALID_HYPERCALL_CODE and not HV_STATUS_ACCESS_DENIED. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-25-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ce057827da03..3d6b448ab18f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2103,6 +2103,15 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES; case HVCALL_SIGNAL_EVENT: return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS; + case HVCALL_POST_DEBUG_DATA: + case HVCALL_RETRIEVE_DEBUG_DATA: + case HVCALL_RESET_DEBUG_SESSION: + /* + * Return 'true' when SynDBG is disabled so the resulting code + * will be HV_STATUS_INVALID_HYPERCALL_CODE. + */ + return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) || + hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING; default: break; } -- cgit v1.2.3 From bb53ecb4d6ea453e55a971295e55dbf76adc0f8c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:51:59 +0200 Subject: KVM: x86: hyper-v: Honor HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED bit Hyper-V partition must possess 'HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED' privilege ('recommended' is rather a misnomer) to issue HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST/SPACE hypercalls. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-26-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 3d6b448ab18f..831279976d9f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2112,6 +2112,12 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) */ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) || hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING; + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: + case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: + return hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; default: break; } -- cgit v1.2.3 From d264eb3c14d0e5df49ecab3e8b51caadf78abefa Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:52:00 +0200 Subject: KVM: x86: hyper-v: Honor HV_X64_CLUSTER_IPI_RECOMMENDED bit Hyper-V partition must possess 'HV_X64_CLUSTER_IPI_RECOMMENDED' privilege ('recommended' is rather a misnomer) to issue HVCALL_SEND_IPI hypercalls. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-27-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 831279976d9f..eded585620a7 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2118,6 +2118,10 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + case HVCALL_SEND_IPI_EX: + case HVCALL_SEND_IPI: + return hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_CLUSTER_IPI_RECOMMENDED; default: break; } -- cgit v1.2.3 From 445caed0213acef29b9d3822b6906f99860ca9ab Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 21 May 2021 11:52:01 +0200 Subject: KVM: x86: hyper-v: Honor HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED bit Hypercalls which use extended processor masks are only available when HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED privilege bit is exposed (and 'RECOMMENDED' is rather a misnomer). Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210521095204.2161214-28-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index eded585620a7..4f911dca7dd6 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2114,11 +2114,19 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: + if (!(hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; case HVCALL_SEND_IPI_EX: + if (!(hv_vcpu->cpuid_cache.enlightenments_eax & + HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + return false; + fallthrough; case HVCALL_SEND_IPI: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_CLUSTER_IPI_RECOMMENDED; -- cgit v1.2.3 From bcb72d0627e8a3e531021c9bd2a14fae8da63ef3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 7 Jun 2021 12:01:56 +0300 Subject: KVM: nVMX: Drop obsolete (and pointless) pdptrs_changed() check Remove the pdptrs_changed() check when loading L2's CR3. The set of available registers is always reset when switching VMCSes (see commit e5d03de5937e, "KVM: nVMX: Reset register cache (available and dirty masks) on VMCS switch"), thus the "are PDPTRs available" check will always fail. And even if it didn't fail, reading guest memory to check the PDPTRs is just as expensive as reading guest memory to load 'em. Signed-off-by: Sean Christopherson Message-Id: <20210607090203.133058-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e77b8ee28df8..47f4aa609778 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1118,11 +1118,9 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * must not be dereferenced. */ if (!nested_ept && is_pae_paging(vcpu) && - (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { - *entry_failure_code = ENTRY_FAIL_PDPTE; - return -EINVAL; - } + CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return -EINVAL; } /* -- cgit v1.2.3 From a36dbec67e26febc1fc551f4819e3c058b25e79c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 7 Jun 2021 12:01:57 +0300 Subject: KVM: nSVM: Drop pointless pdptrs_changed() check on nested transition Remove the "PDPTRs unchanged" check to skip PDPTR loading during nested SVM transitions as it's not at all an optimization. Reading guest memory to get the PDPTRs isn't magically cheaper by doing it in pdptrs_changed(), and if the PDPTRs did change, KVM will end up doing the read twice. Signed-off-by: Sean Christopherson Message-Id: <20210607090203.133058-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 34fc74b0d58a..f0a7f8432527 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -391,10 +391,8 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; if (!nested_npt && is_pae_paging(vcpu) && - (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { - if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) - return -EINVAL; - } + CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) + return -EINVAL; /* * TODO: optimize unconditional TLB flush/MMU sync here and in -- cgit v1.2.3 From c7313155bf11906ad75ae0edc4a97bf93d69c275 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 7 Jun 2021 12:01:58 +0300 Subject: KVM: x86: Always load PDPTRs on CR3 load for SVM w/o NPT and a PAE guest Kill off pdptrs_changed() and instead go through the full kvm_set_cr3() for PAE guest, even if the new CR3 is the same as the current CR3. For VMX, and SVM with NPT enabled, the PDPTRs are unconditionally marked as unavailable after VM-Exit, i.e. the optimization is dead code except for SVM without NPT. In the unlikely scenario that anyone cares about SVM without NPT _and_ a PAE guest, they've got bigger problems if their guest is loading the same CR3 so frequently that the performance of kvm_set_cr3() is notable, especially since KVM's fast PGD switching means reloading the same CR3 does not require a full rebuild. Given that PAE and PCID are mutually exclusive, i.e. a sync and flush are guaranteed in any case, the actual benefits of the pdptrs_changed() optimization are marginal at best. Signed-off-by: Sean Christopherson Message-Id: <20210607090203.133058-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 34 ++-------------------------------- 2 files changed, 2 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 95b254b5a523..601e00876b38 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1506,7 +1506,6 @@ unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); -bool pdptrs_changed(struct kvm_vcpu *vcpu); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 475376a97419..188c180d9f6e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -783,13 +783,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); -static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, - void *data, int offset, int len, u32 access) -{ - return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, - data, offset, len, access); -} - static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); @@ -831,30 +824,6 @@ out: } EXPORT_SYMBOL_GPL(load_pdptrs); -bool pdptrs_changed(struct kvm_vcpu *vcpu) -{ - u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; - int offset; - gfn_t gfn; - int r; - - if (!is_pae_paging(vcpu)) - return false; - - if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - return true; - - gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); - r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), - PFERR_USER_MASK | PFERR_WRITE_MASK); - if (r < 0) - return true; - - return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; -} -EXPORT_SYMBOL_GPL(pdptrs_changed); - void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; @@ -1101,7 +1070,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } #endif - if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { + /* PDPTRs are always reloaded for PAE paging. */ + if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) { if (!skip_tlb_flush) { kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); -- cgit v1.2.3 From b222b0b88162bdef4eceb12a79d5edbbdb23dbfd Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Jun 2021 12:01:59 +0300 Subject: KVM: nSVM: refactor the CR3 reload on migration Document the actual reason why we need to do it on migration and move the call to svm_set_nested_state to be closer to VMX code. To avoid loading the PDPTRs from possibly not up to date memory map, in nested_svm_load_cr3 after the move, move this code to .get_nested_state_pages. Signed-off-by: Maxim Levitsky Message-Id: <20210607090203.133058-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f0a7f8432527..e917eba659b3 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -385,12 +385,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) * if we are emulating VM-Entry into a guest with NPT enabled. */ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, - bool nested_npt) + bool nested_npt, bool reload_pdptrs) { if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) return -EINVAL; - if (!nested_npt && is_pae_paging(vcpu) && + if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) && CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) return -EINVAL; @@ -574,7 +574,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_vmcb02_prepare_save(svm, vmcb12); ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3, - nested_npt_enabled(svm)); + nested_npt_enabled(svm), true); if (ret) return ret; @@ -801,7 +801,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false); + rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true); if (rc) return 1; @@ -1297,6 +1297,19 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, !nested_vmcb_valid_sregs(vcpu, save)) goto out_free; + /* + * While the nested guest CR3 is already checked and set by + * KVM_SET_SREGS, it was set when nested state was yet loaded, + * thus MMU might not be initialized correctly. + * Set it again to fix this. + */ + + ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm), false); + if (WARN_ON_ONCE(ret)) + goto out_free; + + /* * All checks done, we can enter guest mode. Userspace provides * vmcb12.control, which will be combined with L1 and stored into @@ -1354,9 +1367,14 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) if (WARN_ON(!is_guest_mode(vcpu))) return true; - if (nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, - nested_npt_enabled(svm))) - return false; + if (!nested_npt_enabled(svm) && is_pae_paging(vcpu)) + /* + * Reload the guest's PDPTRs since after a migration + * the guest CR3 might be restored prior to setting the nested + * state which can lead to a load of wrong PDPTRs. + */ + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + return false; if (!nested_svm_vmrun_msrpm(svm)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; -- cgit v1.2.3 From 0f85722341b0e3a67d0f2d2ae943b9193cb3e1b0 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Jun 2021 12:02:00 +0300 Subject: KVM: nVMX: delay loading of PDPTRs to KVM_REQ_GET_NESTED_STATE_PAGES Similar to the rest of guest page accesses after a migration, this access should be delayed to KVM_REQ_GET_NESTED_STATE_PAGES. Signed-off-by: Maxim Levitsky Message-Id: <20210607090203.133058-6-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 47f4aa609778..ac306678afcc 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1105,7 +1105,8 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to * @entry_failure_code. */ -static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, + bool nested_ept, bool reload_pdptrs, enum vm_entry_failure_code *entry_failure_code) { if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { @@ -1117,7 +1118,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * If PAE paging and EPT are both on, CR3 is not used by the CPU and * must not be dereferenced. */ - if (!nested_ept && is_pae_paging(vcpu) && + if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { *entry_failure_code = ENTRY_FAIL_PDPTE; return -EINVAL; @@ -2487,6 +2488,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * is assigned to entry_failure_code on failure. */ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry, enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2579,7 +2581,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), - entry_failure_code)) + from_vmentry, entry_failure_code)) return -EINVAL; /* @@ -3120,6 +3122,17 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct page *page; u64 hpa; + if (!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { + /* + * Reload the guest's PDPTRs since after a migration + * the guest CR3 might be restored prior to setting the nested + * state which can lead to a load of wrong PDPTRs. + */ + if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) + return false; + } + + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * Translate L1 physical address to host physical @@ -3371,7 +3384,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, enter_guest_mode(vcpu); - if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { exit_reason.basic = EXIT_REASON_INVALID_STATE; vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; @@ -4226,7 +4239,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * Only PDPTE load can fail as the value of cr3 was checked on entry and * couldn't have changed. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); -- cgit v1.2.3 From 329675dde93c6f30009dc413197bdf2b971f1e5e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Jun 2021 12:02:01 +0300 Subject: KVM: x86: introduce kvm_register_clear_available Small refactoring that will be used in the next patch. Signed-off-by: Maxim Levitsky Message-Id: <20210607090203.133058-7-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm_cache_regs.h | 7 +++++++ arch/x86/kvm/svm/svm.c | 6 ++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index ebddbd37a0bf..296d67f689ef 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -55,6 +55,13 @@ static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); } +static inline void kvm_register_clear_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, enum kvm_reg reg) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9bb4692728ef..b6afa6b63c8f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3873,10 +3873,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); - if (npt_enabled) { - vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); - vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); - } + if (npt_enabled) + kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR); /* * We need to handle MC intercepts here before the vcpu has a chance to -- cgit v1.2.3 From 6dba940352038b56db9b591b172fb2ec76a5fd5e Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Jun 2021 12:02:02 +0300 Subject: KVM: x86: Introduce KVM_GET_SREGS2 / KVM_SET_SREGS2 This is a new version of KVM_GET_SREGS / KVM_SET_SREGS. It has the following changes: * Has flags for future extensions * Has vcpu's PDPTRs, allowing to save/restore them on migration. * Lacks obsolete interrupt bitmap (done now via KVM_SET_VCPU_EVENTS) New capability, KVM_CAP_SREGS2 is added to signal the userspace of this ioctl. Signed-off-by: Maxim Levitsky Message-Id: <20210607090203.133058-8-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 48 ++++++++++++++ arch/x86/include/uapi/asm/kvm.h | 13 ++++ arch/x86/kvm/kvm_cache_regs.h | 5 ++ arch/x86/kvm/x86.c | 142 ++++++++++++++++++++++++++++++++-------- include/uapi/linux/kvm.h | 4 ++ 5 files changed, 185 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 80154d5d98a1..cded99561adf 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5034,6 +5034,54 @@ see KVM_XEN_VCPU_SET_ATTR above. The KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST type may not be used with the KVM_XEN_VCPU_GET_ATTR ioctl. + +4.131 KVM_GET_SREGS2 +------------------ + +:Capability: KVM_CAP_SREGS2 +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_sregs2 (out) +:Returns: 0 on success, -1 on error + +Reads special registers from the vcpu. +This ioctl (when supported) replaces the KVM_GET_SREGS. + +:: + +struct kvm_sregs2 { + /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 flags; + __u64 pdptrs[4]; +}; + +flags values for ``kvm_sregs2``: + +``KVM_SREGS2_FLAGS_PDPTRS_VALID`` + + Indicates thats the struct contain valid PDPTR values. + + +4.132 KVM_SET_SREGS2 +------------------ + +:Capability: KVM_CAP_SREGS2 +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_sregs2 (in) +:Returns: 0 on success, -1 on error + +Writes special registers into the vcpu. +See KVM_GET_SREGS2 for the data structures. +This ioctl (when supported) replaces the KVM_SET_SREGS. + + 5. The kvm_run structure ======================== diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0662f644aad9..a6c327f8ad9e 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -159,6 +159,19 @@ struct kvm_sregs { __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; }; +struct kvm_sregs2 { + /* out (KVM_GET_SREGS2) / in (KVM_SET_SREGS2) */ + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 flags; + __u64 pdptrs[4]; +}; +#define KVM_SREGS2_FLAGS_PDPTRS_VALID 1 + /* for KVM_GET_FPU and KVM_SET_FPU */ struct kvm_fpu { __u8 fpr[8][16]; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 296d67f689ef..90e1ffdc05b7 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -125,6 +125,11 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.walk_mmu->pdptrs[index]; } +static inline void kvm_pdptr_write(struct kvm_vcpu *vcpu, int index, u64 value) +{ + vcpu->arch.walk_mmu->pdptrs[index] = value; +} + static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 188c180d9f6e..8085ab830c80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -114,6 +114,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); + struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -817,7 +820,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); - out: return ret; @@ -3956,6 +3958,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SGX_ATTRIBUTE: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + case KVM_CAP_SREGS2: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -4870,6 +4873,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; union { + struct kvm_sregs2 *sregs2; struct kvm_lapic_state *lapic; struct kvm_xsave *xsave; struct kvm_xcrs *xcrs; @@ -5242,6 +5246,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } #endif + case KVM_GET_SREGS2: { + u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL); + r = -ENOMEM; + if (!u.sregs2) + goto out; + __get_sregs2(vcpu, u.sregs2); + r = -EFAULT; + if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2))) + goto out; + r = 0; + break; + } + case KVM_SET_SREGS2: { + u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); + if (IS_ERR(u.sregs2)) { + r = PTR_ERR(u.sregs2); + u.sregs2 = NULL; + goto out; + } + r = __set_sregs2(vcpu, u.sregs2); + break; + } default: r = -EINVAL; } @@ -9937,7 +9963,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) } EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); -static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; @@ -9970,14 +9996,36 @@ skip_protected_regs: sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); +} - memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap)); +static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + __get_sregs_common(vcpu, sregs); + + if (vcpu->arch.guest_state_protected) + return; if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); } +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) +{ + int i; + + __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2); + + if (vcpu->arch.guest_state_protected) + return; + + if (is_pae_paging(vcpu)) { + for (i = 0 ; i < 4 ; i++) + sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); + sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; + } +} + int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -10096,24 +10144,23 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return kvm_is_valid_cr4(vcpu, sregs->cr4); } -static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, + int *mmu_reset_needed, bool update_pdptrs) { struct msr_data apic_base_msr; - int mmu_reset_needed = 0; - int pending_vec, max_bits, idx; + int idx; struct desc_ptr dt; - int ret = -EINVAL; if (!kvm_is_valid_sregs(vcpu, sregs)) - goto out; + return -EINVAL; apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) - goto out; + return -EINVAL; if (vcpu->arch.guest_state_protected) - goto skip_protected_regs; + return 0; dt.size = sregs->idt.limit; dt.address = sregs->idt.base; @@ -10123,31 +10170,30 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) static_call(kvm_x86_set_gdt)(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; + *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.efer != sregs->efer; + *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; static_call(kvm_x86_set_efer)(vcpu, sregs->efer); - mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; + *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (is_pae_paging(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); - mmu_reset_needed = 1; + if (update_pdptrs) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (is_pae_paging(vcpu)) { + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + *mmu_reset_needed = 1; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); } - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (mmu_reset_needed) - kvm_mmu_reset_context(vcpu); kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); @@ -10167,20 +10213,62 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -skip_protected_regs: + return 0; +} + +static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + int pending_vec, max_bits; + int mmu_reset_needed = 0; + int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true); + + if (ret) + return ret; + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + max_bits = KVM_NR_INTERRUPTS; pending_vec = find_first_bit( (const unsigned long *)sregs->interrupt_bitmap, max_bits); + if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); + kvm_make_request(KVM_REQ_EVENT, vcpu); } + return 0; +} - kvm_make_request(KVM_REQ_EVENT, vcpu); +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) +{ + int mmu_reset_needed = 0; + bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; + bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) && + !(sregs2->efer & EFER_LMA); + int i, ret; - ret = 0; -out: - return ret; + if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID) + return -EINVAL; + + if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected)) + return -EINVAL; + + ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2, + &mmu_reset_needed, !valid_pdptrs); + if (ret) + return ret; + + if (valid_pdptrs) { + for (i = 0; i < 4 ; i++) + kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]); + + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + mmu_reset_needed = 1; + } + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + return 0; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 792816144092..90d44138dbfb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1084,6 +1084,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 #define KVM_CAP_PTP_KVM 198 #define KVM_CAP_HYPERV_ENFORCE_CPUID 199 +#define KVM_CAP_SREGS2 200 #ifdef KVM_CAP_IRQ_ROUTING @@ -1622,6 +1623,9 @@ struct kvm_xen_hvm_attr { #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) +#define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) +#define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) + struct kvm_xen_vcpu_attr { __u16 type; __u16 pad[3]; -- cgit v1.2.3 From 158a48ecf776d0ebc916befcb0dc0862f136a31f Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 7 Jun 2021 12:02:03 +0300 Subject: KVM: x86: avoid loading PDPTRs after migration when possible if new KVM_*_SREGS2 ioctls are used, the PDPTRs are a part of the migration state and are correctly restored by those ioctls. Signed-off-by: Maxim Levitsky Message-Id: <20210607090203.133058-9-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/svm/nested.c | 3 ++- arch/x86/kvm/vmx/nested.c | 3 ++- arch/x86/kvm/x86.c | 3 +++ 4 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 601e00876b38..383106901fe2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -862,6 +862,12 @@ struct kvm_vcpu_arch { /* Protected Guests */ bool guest_state_protected; + /* + * Set when PDPTS were loaded directly by the userspace without + * reading the guest memory + */ + bool pdptrs_from_userspace; + #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; #endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e917eba659b3..c902ace2bd17 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1367,7 +1367,8 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) if (WARN_ON(!is_guest_mode(vcpu))) return true; - if (!nested_npt_enabled(svm) && is_pae_paging(vcpu)) + if (!vcpu->arch.pdptrs_from_userspace && + !nested_npt_enabled(svm) && is_pae_paging(vcpu)) /* * Reload the guest's PDPTRs since after a migration * the guest CR3 might be restored prior to setting the nested diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ac306678afcc..1a2f000a5dae 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3122,7 +3122,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct page *page; u64 hpa; - if (!nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { + if (!vcpu->arch.pdptrs_from_userspace && + !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { /* * Reload the guest's PDPTRs since after a migration * the guest CR3 might be restored prior to setting the nested diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8085ab830c80..1727178b8961 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -820,6 +820,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + vcpu->arch.pdptrs_from_userspace = false; + out: return ret; @@ -10265,6 +10267,7 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); mmu_reset_needed = 1; + vcpu->arch.pdptrs_from_userspace = true; } if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); -- cgit v1.2.3 From 1e9dfbd748f37dfa51fcdc82a7afddde1cf8d0ed Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:16 +0200 Subject: KVM: nVMX: Use '-1' in 'hv_evmcs_vmptr' to indicate that eVMCS is not in use Instead of checking 'vmx->nested.hv_evmcs' use '-1' in 'vmx->nested.hv_evmcs_vmptr' to indicate 'evmcs is not in use' state. This matches how we check 'vmx->nested.current_vmptr'. Introduce EVMPTR_INVALID and evmptr_is_valid() and use it instead of raw '-1' check as a preparation to adding other 'special' values. No functional change intended. Suggested-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 3 +++ arch/x86/kvm/vmx/evmcs.h | 7 ++++++ arch/x86/kvm/vmx/nested.c | 55 ++++++++++++++++++++++++----------------------- arch/x86/kvm/vmx/nested.h | 2 +- arch/x86/kvm/vmx/vmx.c | 1 + 5 files changed, 40 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 41f24661af04..896b2a50b4aa 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -319,6 +319,9 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) if (unlikely(!assist_page.enlighten_vmentry)) return false; + if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs))) + return false; + *evmcs_gpa = assist_page.current_nested_vmcs; return true; diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index bd41d9462355..47f802f71f6a 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -197,6 +197,13 @@ static inline void evmcs_load(u64 phys_addr) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +#define EVMPTR_INVALID (-1ULL) + +static inline bool evmptr_is_valid(u64 evmptr) +{ + return evmptr != EVMPTR_INVALID; +} + enum nested_evmptrld_status { EVMPTRLD_DISABLED, EVMPTRLD_SUCCEEDED, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a2f000a5dae..84d9a8d569bb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -187,7 +187,8 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS. */ - if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + if (vmx->nested.current_vmptr == -1ull && + !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) return nested_vmx_failInvalid(vcpu); return nested_vmx_failValid(vcpu, vm_instruction_error); @@ -221,12 +222,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->nested.hv_evmcs) - return; + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); + vmx->nested.hv_evmcs = NULL; + } - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs_vmptr = 0; - vmx->nested.hv_evmcs = NULL; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, @@ -1981,10 +1982,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) return EVMPTRLD_DISABLED; - if (unlikely(!vmx->nested.hv_evmcs || - evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { - if (!vmx->nested.hv_evmcs) - vmx->nested.current_vmptr = -1ull; + if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { + vmx->nested.current_vmptr = -1ull; nested_release_evmcs(vcpu); @@ -2055,7 +2054,7 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.hv_evmcs) { + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { copy_vmcs12_to_enlightened(vmx); /* All fields are clean */ vmx->nested.hv_evmcs->hv_clean_fields |= @@ -2207,7 +2206,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) + if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) prepare_vmcs02_early_rare(vmx, vmcs12); /* @@ -2492,15 +2491,14 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; bool load_guest_pdptrs_vmcs12 = false; - if (vmx->nested.dirty_vmcs12 || hv_evmcs) { + if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; - load_guest_pdptrs_vmcs12 = !hv_evmcs || - !(hv_evmcs->hv_clean_fields & + load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || + !(vmx->nested.hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } @@ -3102,7 +3100,8 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. */ - if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) { + if (vmx->nested.enlightened_vmcs_enabled && + !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); @@ -3465,7 +3464,7 @@ vmentry_fail_vmexit: load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason.full; - if (enable_shadow_vmcs || vmx->nested.hv_evmcs) + if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; } @@ -3493,7 +3492,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return nested_vmx_failInvalid(vcpu); } - if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)) + if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && + vmx->nested.current_vmptr == -1ull)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -3507,7 +3507,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); - if (vmx->nested.hv_evmcs) { + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { copy_enlightened_to_vmcs12(vmx); /* Enlightened VMCS doesn't have launch state */ vmcs12->launch_state = !launch; @@ -4066,10 +4066,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.hv_evmcs) + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); - vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; + vmx->nested.need_sync_vmcs02_to_vmcs12_rare = + !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -4569,7 +4570,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, } if ((vm_exit_reason != -1) && - (enable_shadow_vmcs || vmx->nested.hv_evmcs)) + (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ @@ -5265,7 +5266,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ - if (vmx->nested.hv_evmcs) + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) return 1; if (vmx->nested.current_vmptr != vmptr) { @@ -5321,7 +5322,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) + if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) return 1; if (get_vmx_mem_address(vcpu, exit_qual, instr_info, @@ -6093,7 +6094,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx_has_valid_vmcs12(vcpu)) { kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); - if (vmx->nested.hv_evmcs) + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && @@ -6149,7 +6150,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, } else { copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { - if (vmx->nested.hv_evmcs) + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) copy_enlightened_to_vmcs12(vmx); else if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 184418baeb3c..c4397e83614d 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -63,7 +63,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) * have vmcs12 if it is true. */ return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || - vmx->nested.hv_evmcs; + evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); } static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e3f744bec763..68a72c80bd3f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6861,6 +6861,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; -- cgit v1.2.3 From 6a789ca5d5038a60f51c374067fd9abab13df596 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:17 +0200 Subject: KVM: nVMX: Don't set 'dirty_vmcs12' flag on enlightened VMPTRLD 'dirty_vmcs12' is only checked in prepare_vmcs02_early()/prepare_vmcs02() and both checks look like: 'vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)' so for eVMCS case the flag changes nothing. Drop the assignment to avoid the confusion. No functional change intended. Reported-by: Maxim Levitsky Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 84d9a8d569bb..dbee5479103f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2021,7 +2021,6 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( return EVMPTRLD_VMFAIL; } - vmx->nested.dirty_vmcs12 = true; vmx->nested.hv_evmcs_vmptr = evmcs_gpa; evmcs_gpa_changed = true; -- cgit v1.2.3 From 02761716801dbc99d977bb281de7c1052405c9f5 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:18 +0200 Subject: KVM: nVMX: Release eVMCS when enlightened VMENTRY was disabled In theory, L1 can try to disable enlightened VMENTRY in VP assist page and try to issue VMLAUNCH/VMRESUME. While nested_vmx_handle_enlightened_vmptrld() properly handles this as 'EVMPTRLD_DISABLED', previously mapped eVMCS remains mapped and thus all evmptr_is_valid() checks will still pass and nested_vmx_run() will proceed when it shouldn't. Release eVMCS immediately when we detect that enlightened vmentry was disabled by L1. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index dbee5479103f..8d814bf8448e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1979,8 +1979,10 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( if (likely(!vmx->nested.enlightened_vmcs_enabled)) return EVMPTRLD_DISABLED; - if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) + if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { + nested_release_evmcs(vcpu); return EVMPTRLD_DISABLED; + } if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { vmx->nested.current_vmptr = -1ull; -- cgit v1.2.3 From 25641cafabc6dcc0a2d32dbbfd8fc448513b339d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:19 +0200 Subject: KVM: nVMX: Make copy_vmcs12_to_enlightened()/copy_enlightened_to_vmcs12() return 'void' copy_vmcs12_to_enlightened()/copy_enlightened_to_vmcs12() don't return any result, make them return 'void'. No functional change intended. Suggested-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8d814bf8448e..e72a637658f1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1586,7 +1586,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(vmx->loaded_vmcs->vmcs); } -static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) +static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) { struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; @@ -1799,10 +1799,10 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; */ - return 0; + return; } -static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) +static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) { struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; @@ -1962,7 +1962,7 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; - return 0; + return; } /* -- cgit v1.2.3 From 278499686b18e9012ddefbe0ecabc83e6c0264fe Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:20 +0200 Subject: KVM: nVMX: Introduce 'EVMPTR_MAP_PENDING' post-migration state Unlike regular set_current_vmptr(), nested_vmx_handle_enlightened_vmptrld() can not be called directly from vmx_set_nested_state() as KVM may not have all the information yet (e.g. HV_X64_MSR_VP_ASSIST_PAGE MSR may not be restored yet). Enlightened VMCS is mapped later while getting nested state pages. In the meantime, vmx->nested.hv_evmcs_vmptr remains 'EVMPTR_INVALID' and it's indistinguishable from 'evmcs is not in use' case. This leads to certain issues, in particular, if KVM_GET_NESTED_STATE is called right after KVM_SET_NESTED_STATE, KVM_STATE_NESTED_EVMCS flag in the resulting state will be unset (and such state will later fail to load). Introduce 'EVMPTR_MAP_PENDING' state to detect not-yet-mapped eVMCS after restore. With this, the 'is_guest_mode(vcpu)' hack in vmx_has_valid_vmcs12() is no longer needed. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.h | 3 ++- arch/x86/kvm/vmx/nested.c | 6 ++++-- arch/x86/kvm/vmx/nested.h | 11 +++-------- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 47f802f71f6a..2ec9b46f0d0c 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -198,10 +198,11 @@ static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ #define EVMPTR_INVALID (-1ULL) +#define EVMPTR_MAP_PENDING (-2ULL) static inline bool evmptr_is_valid(u64 evmptr) { - return evmptr != EVMPTR_INVALID; + return evmptr != EVMPTR_INVALID && evmptr != EVMPTR_MAP_PENDING; } enum nested_evmptrld_status { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e72a637658f1..46de0147ca01 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3102,7 +3102,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) * properly reflected. */ if (vmx->nested.enlightened_vmcs_enabled && - !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); @@ -6095,7 +6095,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx_has_valid_vmcs12(vcpu)) { kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ + if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && @@ -6294,6 +6295,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * restored yet. EVMCS will be mapped from * nested_get_vmcs12_pages(). */ + vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); } else { return -EINVAL; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index c4397e83614d..b69a80f43b37 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -56,14 +56,9 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * In case we do two consecutive get/set_nested_state()s while L2 was - * running hv_evmcs may end up not being mapped (we map it from - * nested_vmx_run()/vmx_vcpu_run()). Check is_guest_mode() as we always - * have vmcs12 if it is true. - */ - return is_guest_mode(vcpu) || vmx->nested.current_vmptr != -1ull || - evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); + /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ + return vmx->nested.current_vmptr != -1ull || + vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID; } static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 3b19b81acf300a3d452aa07b21d8db528254cb56 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:21 +0200 Subject: KVM: nVMX: Release enlightened VMCS on VMCLEAR Unlike VMREAD/VMWRITE/VMPTRLD, VMCLEAR is a valid instruction when enlightened VMCS is in use. TLFS has the following brief description: "The L1 hypervisor can execute a VMCLEAR instruction to transition an enlightened VMCS from the active to the non-active state". Normally, this change can be ignored as unmapping active eVMCS can be postponed until the next VMLAUNCH instruction but in case nested state is migrated with KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE, keeping eVMCS mapped may result in its synchronization with VMCS12 and this is incorrect: L1 hypervisor is free to reuse inactive eVMCS memory for something else. Inactive eVMCS after VMCLEAR can just be unmapped. Reviewed-by: Maxim Levitsky Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-7-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 46de0147ca01..6a3fdb90870a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5026,6 +5026,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); + } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { + nested_release_evmcs(vcpu); } return nested_vmx_succeed(vcpu); -- cgit v1.2.3 From d6bf71a18c74de61548ddad44ff95306fe85f829 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:22 +0200 Subject: KVM: nVMX: Ignore 'hv_clean_fields' data when eVMCS data is copied in vmx_get_nested_state() 'Clean fields' data from enlightened VMCS is only valid upon vmentry: L1 hypervisor is not obliged to keep it up-to-date while it is mangling L2's state, KVM_GET_NESTED_STATE request may come at a wrong moment when actual eVMCS changes are unsynchronized with 'hv_clean_fields'. As upon migration VMCS12 is used as a source of ultimate truth, we must make sure we pick all the changes to eVMCS and thus 'clean fields' data must be ignored. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-8-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6a3fdb90870a..3787be104ff0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1586,7 +1586,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(vmx->loaded_vmcs->vmcs); } -static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) +static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) { struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; @@ -1595,7 +1595,7 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { vmcs12->guest_rsp = evmcs->guest_rsp; vmcs12->guest_rflags = evmcs->guest_rflags; @@ -1603,23 +1603,23 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) evmcs->guest_interruptibility_info; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { vmcs12->cpu_based_vm_exec_control = evmcs->cpu_based_vm_exec_control; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { vmcs12->exception_bitmap = evmcs->exception_bitmap; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { vmcs12->vm_entry_controls = evmcs->vm_entry_controls; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { vmcs12->vm_entry_intr_info_field = evmcs->vm_entry_intr_info_field; @@ -1629,7 +1629,7 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) evmcs->vm_entry_instruction_len; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { vmcs12->host_ia32_pat = evmcs->host_ia32_pat; vmcs12->host_ia32_efer = evmcs->host_ia32_efer; @@ -1649,7 +1649,7 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->host_tr_selector = evmcs->host_tr_selector; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { vmcs12->pin_based_vm_exec_control = evmcs->pin_based_vm_exec_control; @@ -1658,18 +1658,18 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) evmcs->secondary_vm_exec_control; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { vmcs12->io_bitmap_a = evmcs->io_bitmap_a; vmcs12->io_bitmap_b = evmcs->io_bitmap_b; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { vmcs12->msr_bitmap = evmcs->msr_bitmap; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { vmcs12->guest_es_base = evmcs->guest_es_base; vmcs12->guest_cs_base = evmcs->guest_cs_base; @@ -1709,14 +1709,14 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->guest_tr_selector = evmcs->guest_tr_selector; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { vmcs12->tsc_offset = evmcs->tsc_offset; vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; @@ -1728,7 +1728,7 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->guest_dr7 = evmcs->guest_dr7; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { vmcs12->host_fs_base = evmcs->host_fs_base; vmcs12->host_gs_base = evmcs->host_gs_base; @@ -1738,13 +1738,13 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) vmcs12->host_rsp = evmcs->host_rsp; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { vmcs12->ept_pointer = evmcs->ept_pointer; vmcs12->virtual_processor_id = evmcs->virtual_processor_id; } - if (unlikely(!(evmcs->hv_clean_fields & + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; @@ -3509,7 +3509,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return nested_vmx_failInvalid(vcpu); if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { - copy_enlightened_to_vmcs12(vmx); + copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); /* Enlightened VMCS doesn't have launch state */ vmcs12->launch_state = !launch; } else if (enable_shadow_vmcs) { @@ -6155,7 +6155,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) - copy_enlightened_to_vmcs12(vmx); + /* + * L1 hypervisor is not obliged to keep eVMCS + * clean fields data always up-to-date while + * not in guest mode, 'hv_clean_fields' is only + * supposed to be actual upon vmentry so we need + * to ignore it here and do full copy. + */ + copy_enlightened_to_vmcs12(vmx, 0); else if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); } -- cgit v1.2.3 From b7685cfd5e96456be653b61c405ea65f8de95bd6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:23 +0200 Subject: KVM: nVMX: Force enlightened VMCS sync from nested_vmx_failValid() 'need_vmcs12_to_shadow_sync' is used for both shadow and enlightened VMCS sync when we exit to L1. The comment in nested_vmx_failValid() validly states why shadow vmcs sync can be omitted but this doesn't apply to enlightened VMCS as it 'shadows' all VMCS12 fields. Reviewed-by: Maxim Levitsky Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-9-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3787be104ff0..c73668b97f5e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -173,9 +173,13 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu, | X86_EFLAGS_ZF); get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; /* - * We don't need to force a shadow sync because - * VM_INSTRUCTION_ERROR is not shadowed + * We don't need to force sync to shadow VMCS because + * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all + * fields and thus must be synced. */ + if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; + return kvm_skip_emulated_instruction(vcpu); } -- cgit v1.2.3 From dc313385529f1a1fa20b06bb61239a31aca9d40f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:24 +0200 Subject: KVM: nVMX: Reset eVMCS clean fields data from prepare_vmcs02() When nested state migration happens during L1's execution, it is incorrect to modify eVMCS as it is L1 who 'owns' it at the moment. At least genuine Hyper-V seems to not be very happy when 'clean fields' data changes underneath it. 'Clean fields' data is used in KVM twice: by copy_enlightened_to_vmcs12() and prepare_vmcs02_rare() so we can reset it from prepare_vmcs02() instead. While at it, update a comment stating why exactly we need to reset 'hv_clean_fields' data from L0. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-10-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index c73668b97f5e..2bdc9a24440f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2059,14 +2059,10 @@ void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) copy_vmcs12_to_enlightened(vmx); - /* All fields are clean */ - vmx->nested.hv_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - } else { + else copy_vmcs12_to_shadow(vmx); - } vmx->nested.need_vmcs12_to_shadow_sync = false; } @@ -2616,6 +2612,17 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); + + /* + * It was observed that genuine Hyper-V running in L1 doesn't reset + * 'hv_clean_fields' by itself, it only sets the corresponding dirty + * bits when it changes a field in eVMCS. Mark all fields as clean + * here. + */ + if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + vmx->nested.hv_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + return 0; } -- cgit v1.2.3 From 8629b625e0151c0d6b78a938744ffd74da422682 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 26 May 2021 15:20:25 +0200 Subject: KVM: nVMX: Request to sync eVMCS from VMCS12 after migration VMCS12 is used to keep the authoritative state during nested state migration. In case 'need_vmcs12_to_shadow_sync' flag is set, we're in between L2->L1 vmexit and L1 guest run when actual sync to enlightened (or shadow) VMCS happens. Nested state, however, has no flag for 'need_vmcs12_to_shadow_sync' so vmx_set_nested_state()-> set_current_vmptr() always sets it. Enlightened vmptrld path, however, doesn't have the quirk so some VMCS12 changes may not get properly reflected to eVMCS and L1 will see an incorrect state. Note, during L2 execution or when need_vmcs12_to_shadow_sync is not set the change is effectively a nop: in the former case all changes will get reflected during the first L2->L1 vmexit and in the later case VMCS12 and eVMCS are already in sync (thanks to copy_enlightened_to_vmcs12() in vmx_get_nested_state()). Reviewed-by: Maxim Levitsky Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini Message-Id: <20210526132026.270394-11-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2bdc9a24440f..ee89b48730b6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3120,6 +3120,12 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) if (evmptrld_status == EVMPTRLD_VMFAIL || evmptrld_status == EVMPTRLD_ERROR) return false; + + /* + * Post migration VMCS12 always provides the most actual + * information, copy it to eVMCS upon entry. + */ + vmx->nested.need_vmcs12_to_shadow_sync = true; } return true; -- cgit v1.2.3 From 07ffaf343e34b555c9e7ea39a9c81c439a706f13 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:21 -0700 Subject: KVM: nVMX: Sync all PGDs on nested transition with shadow paging Trigger a full TLB flush on behalf of the guest on nested VM-Enter and VM-Exit when VPID is disabled for L2. kvm_mmu_new_pgd() syncs only the current PGD, which can theoretically leave stale, unsync'd entries in a previous guest PGD, which could be consumed if L2 is allowed to load CR3 with PCID_NOFLUSH=1. Rename KVM_REQ_HV_TLB_FLUSH to KVM_REQ_TLB_FLUSH_GUEST so that it can be utilized for its obvious purpose of emulating a guest TLB flush. Note, there is no change the actual TLB flush executed by KVM, even though the fast PGD switch uses KVM_REQ_TLB_FLUSH_CURRENT. When VPID is disabled for L2, vpid02 is guaranteed to be '0', and thus nested_get_vpid02() will return the VPID that is shared by L1 and L2. Generate the request outside of kvm_mmu_new_pgd(), as getting the common helper to correctly identify which requested is needed is quite painful. E.g. using KVM_REQ_TLB_FLUSH_GUEST when nested EPT is in play is wrong as a TLB flush from the L1 kernel's perspective does not invalidate EPT mappings. And, by using KVM_REQ_TLB_FLUSH_GUEST, nVMX can do future simplification by moving the logic into nested_vmx_transition_tlb_flush(). Fixes: 41fab65e7c44 ("KVM: nVMX: Skip MMU sync on nested VMX transition when possible") Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/vmx/nested.c | 17 ++++++++++++----- arch/x86/kvm/x86.c | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 383106901fe2..f44a9795b91f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -85,7 +85,7 @@ #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) -#define KVM_REQ_HV_TLB_FLUSH \ +#define KVM_REQ_TLB_FLUSH_GUEST \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4f911dca7dd6..b07592ca92f0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1829,7 +1829,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, NULL, vcpu_mask, &hv_vcpu->tlb_flush); ret_success: diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ee89b48730b6..a9906c8344b8 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1131,12 +1131,19 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, /* * Unconditionally skip the TLB flush on fast CR3 switch, all TLB - * flushes are handled by nested_vmx_transition_tlb_flush(). See - * nested_vmx_transition_mmu_sync for details on skipping the MMU sync. + * flushes are handled by nested_vmx_transition_tlb_flush(). */ - if (!nested_ept) - kvm_mmu_new_pgd(vcpu, cr3, true, - !nested_vmx_transition_mmu_sync(vcpu)); + if (!nested_ept) { + kvm_mmu_new_pgd(vcpu, cr3, true, true); + + /* + * A TLB flush on VM-Enter/VM-Exit flushes all linear mappings + * across all PCIDs, i.e. all PGDs need to be synchronized. + * See nested_vmx_transition_mmu_sync() for more details. + */ + if (nested_vmx_transition_mmu_sync(vcpu)) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1727178b8961..efcdd1f46d64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9279,7 +9279,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); - if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { -- cgit v1.2.3 From 0e75225dfa4c5d5d51291f54a3d2d5895bad38da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:22 -0700 Subject: KVM: nVMX: Ensure 64-bit shift when checking VMFUNC bitmap Use BIT_ULL() instead of an open-coded shift to check whether or not a function is enabled in L1's VMFUNC bitmap. This is a benign bug as KVM supports only bit 0, and will fail VM-Enter if any other bits are set, i.e. bits 63:32 are guaranteed to be zero. Note, "function" is bounded by hardware as VMFUNC will #UD before taking a VM-Exit if the function is greater than 63. Before: if ((vmcs12->vm_function_control & (1 << function)) == 0) 0x000000000001a916 <+118>: mov $0x1,%eax 0x000000000001a91b <+123>: shl %cl,%eax 0x000000000001a91d <+125>: cltq 0x000000000001a91f <+127>: and 0x128(%rbx),%rax After: if (!(vmcs12->vm_function_control & BIT_ULL(function & 63))) 0x000000000001a955 <+117>: mov 0x128(%rbx),%rdx 0x000000000001a95c <+124>: bt %rax,%rdx Fixes: 27c42a1bb867 ("KVM: nVMX: Enable VMFUNC for the L1 hypervisor") Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a9906c8344b8..775df9e2ff88 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5598,7 +5598,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) } vmcs12 = get_vmcs12(vcpu); - if ((vmcs12->vm_function_control & (1 << function)) == 0) + if (!(vmcs12->vm_function_control & BIT_ULL(function))) goto fail; switch (function) { -- cgit v1.2.3 From 272b0a998d084e7667284bdd2d0c675c6a2d11de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:23 -0700 Subject: KVM: nVMX: Don't clobber nested MMU's A/D status on EPTP switch Drop bogus logic that incorrectly clobbers the accessed/dirty enabling status of the nested MMU on an EPTP switch. When nested EPT is enabled, walk_mmu points at L2's _legacy_ page tables, not L1's EPT for L2. This is likely a benign bug, as mmu->ept_ad is never consumed (since the MMU is not a nested EPT MMU), and stuffing mmu_role.base.ad_disabled will never propagate into future shadow pages since the nested MMU isn't used to map anything, just to walk L2's page tables. Note, KVM also does a full MMU reload, i.e. the guest_mmu will be recreated using the new EPTP, and thus any change in A/D enabling will be properly recognized in the relevant MMU. Fixes: 41ab93727467 ("KVM: nVMX: Emulate EPTP switching for the L1 hypervisor") Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 775df9e2ff88..7210e7ca0af4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5546,8 +5546,6 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, { u32 index = kvm_rcx_read(vcpu); u64 new_eptp; - bool accessed_dirty; - struct kvm_mmu *mmu = vcpu->arch.walk_mmu; if (!nested_cpu_has_eptp_switching(vmcs12) || !nested_cpu_has_ept(vmcs12)) @@ -5556,13 +5554,10 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (index >= VMFUNC_EPTP_ENTRIES) return 1; - if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, &new_eptp, index * 8, 8)) return 1; - accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT); - /* * If the (L2) guest does a vmfunc to the currently * active ept pointer, we don't have to do anything else @@ -5571,8 +5566,6 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; - mmu->ept_ad = accessed_dirty; - mmu->mmu_role.base.ad_disabled = !accessed_dirty; vmcs12->ept_pointer = new_eptp; kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); -- cgit v1.2.3 From 21823fbda552252271c948850f80f15edfdf25b6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:24 -0700 Subject: KVM: x86: Invalidate all PGDs for the current PCID on MOV CR3 w/ flush Flush and sync all PGDs for the current/target PCID on MOV CR3 with a TLB flush, i.e. without PCID_NOFLUSH set. Paraphrasing Intel's SDM regarding the behavior of MOV to CR3: - If CR4.PCIDE = 0, invalidates all TLB entries associated with PCID 000H and all entries in all paging-structure caches associated with PCID 000H. - If CR4.PCIDE = 1 and NOFLUSH=0, invalidates all TLB entries associated with the PCID specified in bits 11:0, and all entries in all paging-structure caches associated with that PCID. It is not required to invalidate entries in the TLBs and paging-structure caches that are associated with other PCIDs. - If CR4.PCIDE=1 and NOFLUSH=1, is not required to invalidate any TLB entries or entries in paging-structure caches. Extract and reuse the logic for INVPCID(single) which is effectively the same flow and works even if CR4.PCIDE=0, as the current PCID will be '0' in that case, thus honoring the requirement of flushing PCID=0. Continue passing skip_tlb_flush to kvm_mmu_new_pgd() even though it _should_ be redundant; the clean up will be done in a future patch. The overhead of an unnecessary nop sync is minimal (especially compared to the actual sync), and the TLB flush is handled via request. Avoiding the the negligible overhead is not worth the risk of breaking kernels that backport the fix. Fixes: 956bf3531fba ("kvm: x86: Skip shadow page resync on CR3 switch when indicated by guest") Cc: Junaid Shahid Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 69 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index efcdd1f46d64..8ed5f3252e9d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1062,26 +1062,46 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } EXPORT_SYMBOL_GPL(kvm_set_cr4); +static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + unsigned long roots_to_free = 0; + int i; + + /* + * If neither the current CR3 nor any of the prev_roots use the given + * PCID, then nothing needs to be done here because a resync will + * happen anyway before switching to any other CR3. + */ + if (kvm_get_active_pcid(vcpu) == pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, mmu, roots_to_free); +} + int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { bool skip_tlb_flush = false; + unsigned long pcid = 0; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); if (pcid_enabled) { skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; cr3 &= ~X86_CR3_PCID_NOFLUSH; + pcid = cr3 & X86_CR3_PCID_MASK; } #endif /* PDPTRs are always reloaded for PAE paging. */ - if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) { - if (!skip_tlb_flush) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - return 0; - } + if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) + goto handle_tlb_flush; /* * Do not condition the GPA check on long mode, this helper is used to @@ -1094,10 +1114,23 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); + if (cr3 != kvm_read_cr3(vcpu)) + kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); + vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); +handle_tlb_flush: + /* + * A load of CR3 that flushes the TLB flushes only the current PCID, + * even if PCID is disabled, in which case PCID=0 is flushed. It's a + * moot point in the end because _disabling_ PCID will flush all PCIDs, + * and it's impossible to use a non-zero PCID when PCID is disabled, + * i.e. only PCID=0 can be relevant. + */ + if (!skip_tlb_flush) + kvm_invalidate_pcid(vcpu, pcid); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); @@ -11952,8 +11985,6 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) { bool pcid_enabled; struct x86_exception e; - unsigned i; - unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; @@ -11987,23 +12018,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } - if (kvm_get_active_pcid(vcpu) == operand.pcid) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) - == operand.pcid) - roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - - kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); - /* - * If neither the current cr3 nor any of the prev_roots use the - * given PCID, then nothing needs to be done here because a - * resync will happen anyway before switching to any other CR3. - */ - + kvm_invalidate_pcid(vcpu, operand.pcid); return kvm_skip_emulated_instruction(vcpu); case INVPCID_TYPE_ALL_NON_GLOBAL: -- cgit v1.2.3 From 415b1a0105cd05a428f8b28ac1bf406ca2b4bbd7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:25 -0700 Subject: KVM: x86: Uncondtionally skip MMU sync/TLB flush in MOV CR3's PGD switch Stop leveraging the MMU sync and TLB flush requested by the fast PGD switch helper now that kvm_set_cr3() manually handles the necessary sync, frees, and TLB flush. This will allow dropping the params from the fast PGD helpers since nested SVM is now the odd blob out. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8ed5f3252e9d..7d2c7a3306b7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1115,7 +1115,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 1; if (cr3 != kvm_read_cr3(vcpu)) - kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); + kvm_mmu_new_pgd(vcpu, cr3, true, true); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); -- cgit v1.2.3 From d2e5601907bd294411920a84c0231473557d16b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:26 -0700 Subject: KVM: nSVM: Move TLB flushing logic (or lack thereof) to dedicated helper Introduce nested_svm_transition_tlb_flush() and use it force an MMU sync and TLB flush on nSVM VM-Enter and VM-Exit instead of sneaking the logic into the __kvm_mmu_new_pgd() call sites. Add a partial todo list to document issues that need to be addressed before the unconditional sync and flush can be modified to look more like nVMX's logic. In addition to making nSVM's forced flushing more overt (guess who keeps losing track of it), the new helper brings further convergence between nSVM and nVMX, and also sets the stage for dropping the "skip" params from __kvm_mmu_new_pgd(). Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/nested.c | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index aa9e77f406d9..64d734239efa 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4684,7 +4684,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, struct kvm_mmu *context = &vcpu->arch.guest_mmu; union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); - __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, true, true); if (new_role.as_u64 != context->mmu_role.as_u64) { shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c902ace2bd17..20e672236a75 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -380,6 +380,25 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm) return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; } +static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) +{ + /* + * TODO: optimize unconditional TLB flush/MMU sync. A partial list of + * things to fix before this can be conditional: + * + * - Flush TLBs for both L1 and L2 remote TLB flush + * - Honor L1's request to flush an ASID on nested VMRUN + * - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*] + * - Don't crush a pending TLB flush in vmcb02 on nested VMRUN + * - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST + * + * [*] Unlike nested EPT, SVM's ASID management can invalidate nested + * NPT guest-physical mappings on VMRUN. + */ + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); +} + /* * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true * if we are emulating VM-Entry into a guest with NPT enabled. @@ -394,12 +413,8 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) return -EINVAL; - /* - * TODO: optimize unconditional TLB flush/MMU sync here and in - * kvm_init_shadow_npt_mmu(). - */ if (!nested_npt) - kvm_mmu_new_pgd(vcpu, cr3, false, false); + kvm_mmu_new_pgd(vcpu, cr3, true, true); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); @@ -479,6 +494,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + struct kvm_vcpu *vcpu = &svm->vcpu; /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, @@ -503,10 +519,10 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) /* nested_cr3. */ if (nested_npt_enabled(svm)) - nested_svm_init_mmu_context(&svm->vcpu); + nested_svm_init_mmu_context(vcpu); - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = - svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset = + vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & ~mask) | @@ -521,8 +537,10 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count; svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh; + nested_svm_transition_tlb_flush(vcpu); + /* Enter Guest-Mode */ - enter_guest_mode(&svm->vcpu); + enter_guest_mode(vcpu); /* * Merge guest and host intercepts - must be called with vcpu in @@ -799,6 +817,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_vcpu_unmap(vcpu, &map, true); + nested_svm_transition_tlb_flush(vcpu); + nested_svm_uninit_mmu_context(vcpu); rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true); -- cgit v1.2.3 From b5129100398ac3b6364cfa6dbd55abfd36cf7202 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:27 -0700 Subject: KVM: x86: Drop skip MMU sync and TLB flush params from "new PGD" helpers Drop skip_mmu_sync and skip_tlb_flush from __kvm_mmu_new_pgd() now that all call sites unconditionally skip both the sync and flush. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/mmu/mmu.c | 17 +++++++---------- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/vmx/nested.c | 6 +----- arch/x86/kvm/x86.c | 2 +- 5 files changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f44a9795b91f..d866bfec1337 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1708,8 +1708,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t gva, hpa_t root_hpa); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, - bool skip_mmu_sync); +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, int tdp_huge_page_level); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 64d734239efa..894b9a4a5961 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3949,8 +3949,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, } static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, - union kvm_mmu_page_role new_role, - bool skip_tlb_flush, bool skip_mmu_sync) + union kvm_mmu_page_role new_role) { if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); @@ -3965,10 +3964,10 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); - if (!skip_mmu_sync || force_flush_and_sync_on_reuse) + if (force_flush_and_sync_on_reuse) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - if (!skip_tlb_flush || force_flush_and_sync_on_reuse) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } /* * The last MMIO access's GVA and GPA are cached in the VCPU. When @@ -3987,11 +3986,9 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, to_shadow_page(vcpu->arch.mmu->root_hpa)); } -void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, - bool skip_mmu_sync) +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) { - __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu), - skip_tlb_flush, skip_mmu_sync); + __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu)); } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); @@ -4684,7 +4681,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, struct kvm_mmu *context = &vcpu->arch.guest_mmu; union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); - __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, true, true); + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); if (new_role.as_u64 != context->mmu_role.as_u64) { shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); @@ -4736,7 +4733,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true); + __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base); if (new_role.as_u64 == context->mmu_role.as_u64) return; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 20e672236a75..5f45991edcda 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -414,7 +414,7 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; if (!nested_npt) - kvm_mmu_new_pgd(vcpu, cr3, true, true); + kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7210e7ca0af4..d07b83b1bd3c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1129,12 +1129,8 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; } - /* - * Unconditionally skip the TLB flush on fast CR3 switch, all TLB - * flushes are handled by nested_vmx_transition_tlb_flush(). - */ if (!nested_ept) { - kvm_mmu_new_pgd(vcpu, cr3, true, true); + kvm_mmu_new_pgd(vcpu, cr3); /* * A TLB flush on VM-Enter/VM-Exit flushes all linear mappings diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7d2c7a3306b7..1a0fb0f1c1cb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1115,7 +1115,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) return 1; if (cr3 != kvm_read_cr3(vcpu)) - kvm_mmu_new_pgd(vcpu, cr3, true, true); + kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); -- cgit v1.2.3 From 50a417962a80525da54fa74105bcf17b479cd4bc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:28 -0700 Subject: KVM: nVMX: Consolidate VM-Enter/VM-Exit TLB flush and MMU sync logic Drop the dedicated nested_vmx_transition_mmu_sync() now that the MMU sync is handled via KVM_REQ_TLB_FLUSH_GUEST, and fold that flush into the all-encompassing nested_vmx_transition_tlb_flush(). Opportunistically add a comment explaning why nested EPT never needs to sync the MMU on VM-Enter. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 87 +++++++++++++---------------------------------- 1 file changed, 23 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d07b83b1bd3c..8e2487f21a6f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1062,48 +1062,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, } } -/* - * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit. - * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't - * enable VPID for L2 (implying it expects a TLB flush on VMX transitions). - * Here's why. - * - * If EPT is enabled by L0 a sync is never needed: - * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there - * cannot be unsync'd SPTEs for either L1 or L2. - * - * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter - * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings - * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush - * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't - * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit. - * - * If EPT is disabled by L0: - * - if VPID is enabled by L1 (for L2), the situation is similar to when L1 - * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't - * required to invalidate linear mappings (EPT is disabled so there are - * no combined or guest-physical mappings), i.e. L1 can't rely on the - * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1). - * - * - however if VPID is disabled by L1, then a sync is needed as L1 expects all - * linear mappings (EPT is disabled so there are no combined or guest-physical - * mappings) to be invalidated on both VM-Enter and VM-Exit. - * - * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which - * additionally checks that L2 has been assigned a VPID (when EPT is disabled). - * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect - * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2 - * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has - * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1 - * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't - * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush - * stale TLB entries, at which point L0 will sync L2's MMU. - */ -static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) -{ - return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu)); -} - /* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. On failure, the expected @@ -1129,18 +1087,9 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, return -EINVAL; } - if (!nested_ept) { + if (!nested_ept) kvm_mmu_new_pgd(vcpu, cr3); - /* - * A TLB flush on VM-Enter/VM-Exit flushes all linear mappings - * across all PCIDs, i.e. all PGDs need to be synchronized. - * See nested_vmx_transition_mmu_sync() for more details. - */ - if (nested_vmx_transition_mmu_sync(vcpu)) - kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); - } - vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); @@ -1177,17 +1126,28 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * If VPID is disabled, linear and combined mappings are flushed on - * VM-Enter/VM-Exit, and guest-physical mappings are valid only for - * their associated EPTP. + * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings + * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a + * full TLB flush from the guest's perspective. This is required even + * if VPID is disabled in the host as KVM may need to synchronize the + * MMU in response to the guest TLB flush. + * + * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. + * EPT is a special snowflake, as guest-physical mappings aren't + * flushed on VPID invalidations, including VM-Enter or VM-Exit with + * VPID disabled. As a result, KVM _never_ needs to sync nEPT + * entries on VM-Enter because L1 can't rely on VM-Enter to flush + * those mappings. */ - if (!enable_vpid) + if (!nested_cpu_has_vpid(vmcs12)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return; + } + + /* L2 should never have a VPID if VPID is disabled. */ + WARN_ON(!enable_vpid); /* - * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings - * for *all* contexts to be flushed on VM-Enter/VM-Exit. - * * If VPID is enabled and used by vmc12, but L2 does not have a unique * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate * a VPID for L2, flush the current context as the effective ASID is @@ -1199,13 +1159,12 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, * * If a TLB flush isn't required due to any of the above, and vpid12 is * changing then the new "virtual" VPID (vpid12) will reuse the same - * "real" VPID (vpid02), and so needs to be sync'd. There is no direct + * "real" VPID (vpid02), and so needs to be flushed. There's no direct * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for - * all nested vCPUs. + * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate + * guest-physical mappings, so there is no need to sync the nEPT MMU. */ - if (!nested_cpu_has_vpid(vmcs12)) { - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } else if (!nested_has_guest_tlb_tag(vcpu)) { + if (!nested_has_guest_tlb_tag(vcpu)) { kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } else if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { -- cgit v1.2.3 From 25b62c6274ed466fe2e9f3a681e46d99e6703fd4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:29 -0700 Subject: KVM: nVMX: Free only guest_mode (L2) roots on INVVPID w/o EPT When emulating INVVPID for L1, free only L2+ roots, using the guest_mode tag in the MMU role to identify L2+ roots. From L1's perspective, its own TLB entries use VPID=0, and INVVPID is not requied to invalidate such entries. Per Intel's SDM, INVVPID _may_ invalidate entries with VPID=0, but it is not required to do so. Cc: Lai Jiangshan Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 27 +++++++++++++++++++++++++++ arch/x86/kvm/vmx/nested.c | 7 +++---- 3 files changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d866bfec1337..a92d56590613 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1684,6 +1684,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, ulong roots_to_free); +void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 894b9a4a5961..f4fea68a88f6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3212,6 +3212,33 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); +void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + unsigned long roots_to_free = 0; + hpa_t root_hpa; + int i; + + /* + * This should not be called while L2 is active, L2 can't invalidate + * _only_ its own roots, e.g. INVVPID unconditionally exits. + */ + WARN_ON_ONCE(mmu->mmu_role.base.guest_mode); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + root_hpa = mmu->prev_roots[i].hpa; + if (!VALID_PAGE(root_hpa)) + continue; + + if (!to_shadow_page(root_hpa) || + to_shadow_page(root_hpa)->role.guest_mode) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + } + + kvm_mmu_free_roots(vcpu, mmu, roots_to_free); +} +EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); + + static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8e2487f21a6f..13a4accca348 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5481,8 +5481,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) /* * Sync the shadow page tables if EPT is disabled, L1 is invalidating - * linear mappings for L2 (tagged with L2's VPID). Free all roots as - * VPIDs are not tracked in the MMU role. + * linear mappings for L2 (tagged with L2's VPID). Free all guest + * roots as VPIDs are not tracked in the MMU role. * * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share * an MMU when EPT is disabled. @@ -5490,8 +5490,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. */ if (!enable_ept) - kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, - KVM_MMU_ROOTS_ALL); + kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu); return nested_vmx_succeed(vcpu); } -- cgit v1.2.3 From 28f28d453ffcca4a45c1fd93666d9e77a48cb45b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:30 -0700 Subject: KVM: x86: Use KVM_REQ_TLB_FLUSH_GUEST to handle INVPCID(ALL) emulation Use KVM_REQ_TLB_FLUSH_GUEST instead of KVM_REQ_MMU_RELOAD when emulating INVPCID of all contexts. In the current code, this is a glorified nop as TLB_FLUSH_GUEST becomes kvm_mmu_unload(), same as MMU_RELOAD, when TDP is disabled, which is the only time INVPCID is only intercepted+emulated. In the future, reusing TLB_FLUSH_GUEST will simplify optimizing paths that emulate a guest TLB flush, e.g. by synchronizing as needed instead of completely unloading all MMUs. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1a0fb0f1c1cb..41b936187b2c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12031,7 +12031,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) fallthrough; case INVPCID_TYPE_ALL_INCL_GLOBAL: - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); return kvm_skip_emulated_instruction(vcpu); default: -- cgit v1.2.3 From 39353ab5790be2802b0de29caeba43015fb90dcf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:31 -0700 Subject: KVM: nVMX: Use fast PGD switch when emulating VMFUNC[EPTP_SWITCH] Use __kvm_mmu_new_pgd() via kvm_init_shadow_ept_mmu() to emulate VMFUNC[EPTP_SWITCH] instead of nuking all MMUs. EPTP_SWITCH is the EPT equivalent of MOV to CR3, i.e. is a perfect fit for the common PGD flow, the only hiccup being that A/D enabling is buried in the EPTP. But, that is easily handled by bouncing through kvm_init_shadow_ept_mmu(). Explicitly request a guest TLB flush if VPID is disabled. Per Intel's SDM, if VPID is disabled, "an EPTP-switching VMFUNC invalidates combined mappings associated with VPID 0000H (for all PCIDs and for all EP4TA values, where EP4TA is the value of bits 51:12 of EPTP)". Note, this technically is a very bizarre bug fix of sorts if L2 is using PAE paging, as avoiding the full MMU reload also avoids incorrectly reloading the PDPTEs, which the SDM explicitly states are not touched: If PAE paging is in use, an EPTP-switching VMFUNC does not load the four page-directory-pointer-table entries (PDPTEs) from the guest-physical address in CR3. The logical processor continues to use the four guest-physical addresses already present in the PDPTEs. The guest-physical address in CR3 is not translated through the new EPT paging structures (until some operation that would load the PDPTEs). In addition to optimizing L2's MMU shenanigans, avoiding the full reload also optimizes L1's MMU as KVM_REQ_MMU_RELOAD wipes out all roots in both root_mmu and guest_mmu. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 13a4accca348..23f974fee5d4 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -351,16 +351,21 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vmcs12->guest_physical_address = fault->address; } +static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) +{ + kvm_init_shadow_ept_mmu(vcpu, + to_vmx(vcpu)->nested.msrs.ept_caps & + VMX_EPT_EXECUTE_ONLY_BIT, + nested_ept_ad_enabled(vcpu), + nested_ept_get_eptp(vcpu)); +} + static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_ept_mmu(vcpu, - to_vmx(vcpu)->nested.msrs.ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu), - nested_ept_get_eptp(vcpu)); + nested_ept_new_eptp(vcpu); vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; @@ -5521,8 +5526,10 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, return 1; vmcs12->ept_pointer = new_eptp; + nested_ept_new_eptp(vcpu); - kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + if (!nested_cpu_has_vpid(vmcs12)) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } return 0; -- cgit v1.2.3 From e62f1aa8b9304f4608a6a1517e9041cec555c09d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:32 -0700 Subject: KVM: x86: Defer MMU sync on PCID invalidation Defer the MMU sync on PCID invalidation so that multiple sync requests in a single VM-Exit are batched. This is a very minor optimization as checking for unsync'd children is quite cheap. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41b936187b2c..9ca30a3879d4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1074,7 +1074,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) * happen anyway before switching to any other CR3. */ if (kvm_get_active_pcid(vcpu) == pcid) { - kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } -- cgit v1.2.3 From c906066288d0da7b8c2b5ac4d0d8e85f10f5d5b8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:33 -0700 Subject: KVM: x86: Drop pointless @reset_roots from kvm_init_mmu() Remove the @reset_roots param from kvm_init_mmu(), the one user, kvm_mmu_reset_context() has already unloaded the MMU and thus freed and invalidated all roots. This also happens to be why the reset_roots=true paths doesn't leak roots; they're already invalid. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/mmu/mmu.c | 13 ++----------- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/x86.c | 2 +- 5 files changed, 6 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 9d8550af994c..bc11402df83b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -65,7 +65,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); -void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); +void kvm_init_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, gpa_t nested_cr3); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f4fea68a88f6..720ceb0a1f5c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4877,17 +4877,8 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_last_nonleaf_level(vcpu, g_context); } -void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) +void kvm_init_mmu(struct kvm_vcpu *vcpu) { - if (reset_roots) { - uint i; - - vcpu->arch.mmu->root_hpa = INVALID_PAGE; - - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - } - if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -4913,7 +4904,7 @@ kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - kvm_init_mmu(vcpu, true); + kvm_init_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 5f45991edcda..dca20f949b63 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -419,7 +419,7 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); - kvm_init_mmu(vcpu, false); + kvm_init_mmu(vcpu); return 0; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 23f974fee5d4..aba11422500c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1098,7 +1098,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); - kvm_init_mmu(vcpu, false); + kvm_init_mmu(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ca30a3879d4..e050ae2fc19b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10606,7 +10606,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu_load(vcpu); kvm_set_tsc_khz(vcpu, max_tsc_khz); kvm_vcpu_reset(vcpu, false); - kvm_init_mmu(vcpu, false); + kvm_init_mmu(vcpu); vcpu_put(vcpu); return 0; -- cgit v1.2.3 From 546e8398bc0c7f75f696a24a997d2befeb632154 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:34 -0700 Subject: KVM: nVMX: WARN if subtly-impossible VMFUNC conditions occur WARN and inject #UD when emulating VMFUNC for L2 if the function is out-of-bounds or if VMFUNC is not enabled in vmcs12. Neither condition should occur in practice, as the CPU is supposed to prioritize the #UD over VM-Exit for out-of-bounds input and KVM is supposed to enable VMFUNC in vmcs02 if and only if it's enabled in vmcs12, but neither of those dependencies is obvious. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index aba11422500c..6342bb4c46b3 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5552,6 +5552,16 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) } vmcs12 = get_vmcs12(vcpu); + + /* + * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC + * is enabled in vmcs02 if and only if it's enabled in vmcs12. + */ + if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + if (!(vmcs12->vm_function_control & BIT_ULL(function))) goto fail; -- cgit v1.2.3 From c5ffd408cdc951ba153aea267d96d7cc62c6a97c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Jun 2021 16:42:35 -0700 Subject: KVM: nVMX: Drop redundant checks on vmcs12 in EPTP switching emulation Drop the explicit check on EPTP switching being enabled. The EPTP switching check is handled in the generic VMFUNC function check, while the underlying VMFUNC enablement check is done by hardware and redone by generic VMFUNC emulation. The vmcs12 EPT check is handled by KVM at VM-Enter in the form of a consistency check, keep it but add a WARN. Signed-off-by: Sean Christopherson Message-Id: <20210609234235.1244004-16-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6342bb4c46b3..b531e08a095b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5506,10 +5506,8 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, u32 index = kvm_rcx_read(vcpu); u64 new_eptp; - if (!nested_cpu_has_eptp_switching(vmcs12) || - !nested_cpu_has_ept(vmcs12)) + if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) return 1; - if (index >= VMFUNC_EPTP_ENTRIES) return 1; -- cgit v1.2.3 From bca66dbcd28a41c669921ff7ca066f71e6f3e72e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 9 Jun 2021 17:09:10 +0200 Subject: KVM: x86: Check for pending interrupts when APICv is getting disabled When APICv is active, interrupt injection doesn't raise KVM_REQ_EVENT request (see __apic_accept_irq()) as the required work is done by hardware. In case KVM_REQ_APICV_UPDATE collides with such injection, the interrupt may never get delivered. Currently, the described situation is hardly possible: all kvm_request_apicv_update() calls normally happen upon VM creation when no interrupts are pending. We are, however, going to move unconditional kvm_request_apicv_update() call from kvm_hv_activate_synic() to synic_update_vector() and without this fix 'hyperv_connections' test from kvm-unit-tests gets stuck on IPI delivery attempt right after configuring a SynIC route which triggers APICv disablement. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210609150911.1471882-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e050ae2fc19b..ceb60f64085c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9137,6 +9137,15 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); + + /* + * When APICv gets disabled, we may still have injected interrupts + * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was + * still active when the interrupt got accepted. Make sure + * inject_pending_event() is called to check for that. + */ + if (!vcpu->arch.apicv_active) + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -- cgit v1.2.3 From ade74e1433f32e3fb422e3700d5bab34c57f4f47 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Jun 2021 09:29:05 -0700 Subject: KVM: x86/mmu: Grab nx_lpage_splits as an unsigned long before division Snapshot kvm->stats.nx_lpage_splits into a local unsigned long to avoid 64-bit division on 32-bit kernels. Casting to an unsigned long is safe because the maximum number of shadow pages, n_max_mmu_pages, is also an unsigned long, i.e. KVM will start recycling shadow pages before the number of splits can exceed a 32-bit value. ERROR: modpost: "__udivdi3" [arch/x86/kvm/kvm.ko] undefined! Fixes: 7ee093d4f3f5 ("KVM: switch per-VM stats to u64") Signed-off-by: Sean Christopherson Message-Id: <20210615162905.2132937-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 720ceb0a1f5c..7d3e57678d34 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6043,6 +6043,7 @@ static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel static void kvm_recover_nx_lpages(struct kvm *kvm) { + unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; @@ -6054,7 +6055,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) write_lock(&kvm->mmu_lock); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) break; -- cgit v1.2.3 From e3cb6fa0e2bf4ffc6225a55851f0cf2b93b50f91 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 10 Jun 2021 12:30:32 -0400 Subject: KVM: switch per-VM stats to u64 Make them the same type as vCPU stats. There is no reason to limit the counters to unsigned long. Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/mips/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_host.h | 6 +++--- arch/x86/include/asm/kvm_host.h | 22 +++++++++++----------- virt/kvm/kvm_main.c | 4 ++-- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7cd7d5c8c4bc..d56f365b38a8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -556,7 +556,7 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) } struct kvm_vm_stat { - ulong remote_tlb_flush; + u64 remote_tlb_flush; }; struct kvm_vcpu_stat { diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index fca4547d580f..4245c082095f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -109,7 +109,7 @@ static inline bool kvm_is_error_hva(unsigned long addr) } struct kvm_vm_stat { - ulong remote_tlb_flush; + u64 remote_tlb_flush; }; struct kvm_vcpu_stat { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7f2e90db2050..ae3d4af61b66 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -81,9 +81,9 @@ struct kvmppc_book3s_shadow_vcpu; struct kvm_nested_guest; struct kvm_vm_stat { - ulong remote_tlb_flush; - ulong num_2M_pages; - ulong num_1G_pages; + u64 remote_tlb_flush; + u64 num_2M_pages; + u64 num_1G_pages; }; struct kvm_vcpu_stat { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a92d56590613..a0c29e29dd48 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1158,17 +1158,17 @@ struct kvm_arch { }; struct kvm_vm_stat { - ulong mmu_shadow_zapped; - ulong mmu_pte_write; - ulong mmu_pde_zapped; - ulong mmu_flooded; - ulong mmu_recycled; - ulong mmu_cache_miss; - ulong mmu_unsync; - ulong remote_tlb_flush; - ulong lpages; - ulong nx_lpage_splits; - ulong max_mmu_page_hash_collisions; + u64 mmu_shadow_zapped; + u64 mmu_pte_write; + u64 mmu_pde_zapped; + u64 mmu_flooded; + u64 mmu_recycled; + u64 mmu_cache_miss; + u64 mmu_unsync; + u64 remote_tlb_flush; + u64 lpages; + u64 nx_lpage_splits; + u64 max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fc35ba0ea5d3..ed4d1581d502 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4833,14 +4833,14 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file) static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) { - *val = *(ulong *)((void *)kvm + offset); + *val = *(u64 *)((void *)kvm + offset); return 0; } static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) { - *(ulong *)((void *)kvm + offset) = 0; + *(u64 *)((void *)kvm + offset) = 0; return 0; } -- cgit v1.2.3 From 0dbb11230437895f7cd6fc55da61cef011e997d8 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 8 Jun 2021 18:05:43 +0000 Subject: KVM: X86: Introduce KVM_HC_MAP_GPA_RANGE hypercall This hypercall is used by the SEV guest to notify a change in the page encryption status to the hypervisor. The hypercall should be invoked only when the encryption attribute is changed from encrypted -> decrypted and vice versa. By default all guest pages are considered encrypted. The hypercall exits to userspace to manage the guest shared regions and integrate with the userspace VMM's migration code. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Steve Rutherford Signed-off-by: Brijesh Singh Signed-off-by: Ashish Kalra Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Message-Id: <90778988e1ee01926ff9cac447aacb745f954c8c.1623174621.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 19 +++++++++++++++ Documentation/virt/kvm/cpuid.rst | 7 ++++++ Documentation/virt/kvm/hypercalls.rst | 21 ++++++++++++++++ Documentation/virt/kvm/msr.rst | 13 ++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/include/uapi/asm/kvm_para.h | 13 ++++++++++ arch/x86/kvm/x86.c | 46 +++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 1 + include/uapi/linux/kvm_para.h | 1 + 9 files changed, 123 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index cded99561adf..e328caa35d6c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6950,3 +6950,22 @@ guest according to the bits Hyper-V CPUID feature leaves. Otherwise, all currently implmented Hyper-V features are provided unconditionally when Hyper-V identification is set in the HYPERV_CPUID_INTERFACE (0x40000001) leaf. + +8.34 KVM_CAP_EXIT_HYPERCALL +--------------------------- + +:Capability: KVM_CAP_EXIT_HYPERCALL +:Architectures: x86 +:Type: vm + +This capability, if enabled, will cause KVM to exit to userspace +with KVM_EXIT_HYPERCALL exit reason to process some hypercalls. + +Calling KVM_CHECK_EXTENSION for this capability will return a bitmask +of hypercalls that can be configured to exit to userspace. +Right now, the only such hypercall is KVM_HC_MAP_GPA_RANGE. + +The argument to KVM_ENABLE_CAP is also a bitmask, and must be a subset +of the result of KVM_CHECK_EXTENSION. KVM will forward to userspace +the hypercalls whose corresponding bit is in the argument, and return +ENOSYS for the others. diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst index cf62162d4be2..bda3e3e737d7 100644 --- a/Documentation/virt/kvm/cpuid.rst +++ b/Documentation/virt/kvm/cpuid.rst @@ -96,6 +96,13 @@ KVM_FEATURE_MSI_EXT_DEST_ID 15 guest checks this feature bit before using extended destination ID bits in MSI address bits 11-5. +KVM_FEATURE_HC_MAP_GPA_RANGE 16 guest checks this feature bit before + using the map gpa range hypercall + to notify the page state change + +KVM_FEATURE_MIGRATION_CONTROL 17 guest checks this feature bit before + using MSR_KVM_MIGRATION_CONTROL + KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 host will warn if no guest-side per-cpu warps are expected in kvmclock diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst index ed4fddd364ea..e56fa8b9cfca 100644 --- a/Documentation/virt/kvm/hypercalls.rst +++ b/Documentation/virt/kvm/hypercalls.rst @@ -169,3 +169,24 @@ a0: destination APIC ID :Usage example: When sending a call-function IPI-many to vCPUs, yield if any of the IPI target vCPUs was preempted. + +8. KVM_HC_MAP_GPA_RANGE +------------------------- +:Architecture: x86 +:Status: active +:Purpose: Request KVM to map a GPA range with the specified attributes. + +a0: the guest physical address of the start page +a1: the number of (4kb) pages (must be contiguous in GPA space) +a2: attributes + + Where 'attributes' : + * bits 3:0 - preferred page size encoding 0 = 4kb, 1 = 2mb, 2 = 1gb, etc... + * bit 4 - plaintext = 0, encrypted = 1 + * bits 63:5 - reserved (must be zero) + +**Implementation note**: this hypercall is implemented in userspace via +the KVM_CAP_EXIT_HYPERCALL capability. Userspace must enable that capability +before advertising KVM_FEATURE_HC_MAP_GPA_RANGE in the guest CPUID. In +addition, if the guest supports KVM_FEATURE_MIGRATION_CONTROL, userspace +must also set up an MSR filter to process writes to MSR_KVM_MIGRATION_CONTROL. diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst index e37a14c323d2..9315fc385fb0 100644 --- a/Documentation/virt/kvm/msr.rst +++ b/Documentation/virt/kvm/msr.rst @@ -376,3 +376,16 @@ data: write '1' to bit 0 of the MSR, this causes the host to re-scan its queue and check if there are more notifications pending. The MSR is available if KVM_FEATURE_ASYNC_PF_INT is present in CPUID. + +MSR_KVM_MIGRATION_CONTROL: + 0x4b564d08 + +data: + This MSR is available if KVM_FEATURE_MIGRATION_CONTROL is present in + CPUID. Bit 0 represents whether live migration of the guest is allowed. + + When a guest is started, bit 0 will be 0 if the guest has encrypted + memory and 1 if the guest does not have encrypted memory. If the + guest is communicating page encryption status to the host using the + ``KVM_HC_MAP_GPA_RANGE`` hypercall, it can set bit 0 in this MSR to + allow live migration of the guest. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a0c29e29dd48..e11d64aa0bcd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1087,6 +1087,8 @@ struct kvm_arch { u32 user_space_msr_mask; struct kvm_x86_msr_filter __rcu *msr_filter; + u32 hypercall_exit_enabled; + /* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed; diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 950afebfba88..5146bbab84d4 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -33,6 +33,8 @@ #define KVM_FEATURE_PV_SCHED_YIELD 13 #define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_FEATURE_MSI_EXT_DEST_ID 15 +#define KVM_FEATURE_HC_MAP_GPA_RANGE 16 +#define KVM_FEATURE_MIGRATION_CONTROL 17 #define KVM_HINTS_REALTIME 0 @@ -54,6 +56,7 @@ #define MSR_KVM_POLL_CONTROL 0x4b564d05 #define MSR_KVM_ASYNC_PF_INT 0x4b564d06 #define MSR_KVM_ASYNC_PF_ACK 0x4b564d07 +#define MSR_KVM_MIGRATION_CONTROL 0x4b564d08 struct kvm_steal_time { __u64 steal; @@ -90,6 +93,16 @@ struct kvm_clock_pairing { /* MSR_KVM_ASYNC_PF_INT */ #define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) +/* MSR_KVM_MIGRATION_CONTROL */ +#define KVM_MIGRATION_READY (1 << 0) + +/* KVM_HC_MAP_GPA_RANGE */ +#define KVM_MAP_GPA_RANGE_PAGE_SZ_4K 0 +#define KVM_MAP_GPA_RANGE_PAGE_SZ_2M (1 << 0) +#define KVM_MAP_GPA_RANGE_PAGE_SZ_1G (1 << 1) +#define KVM_MAP_GPA_RANGE_ENC_STAT(n) (n << 4) +#define KVM_MAP_GPA_RANGE_ENCRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(1) +#define KVM_MAP_GPA_RANGE_DECRYPTED KVM_MAP_GPA_RANGE_ENC_STAT(0) /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ceb60f64085c..8b898ec8d349 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -103,6 +103,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; +#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -3996,6 +3998,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SREGS2: r = 1; break; + case KVM_CAP_EXIT_HYPERCALL: + r = KVM_EXIT_HYPERCALL_VALID_MASK; + break; case KVM_CAP_SET_GUEST_DEBUG2: return KVM_GUESTDBG_VALID_MASK; #ifdef CONFIG_KVM_XEN @@ -5622,6 +5627,14 @@ split_irqchip_unlock: if (kvm_x86_ops.vm_copy_enc_context_from) r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); return r; + case KVM_CAP_EXIT_HYPERCALL: + if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { + r = -EINVAL; + break; + } + kvm->arch.hypercall_exit_enabled = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -8548,6 +8561,17 @@ no_yield: return; } +static int complete_hypercall_exit(struct kvm_vcpu *vcpu) +{ + u64 ret = vcpu->run->hypercall.ret; + + if (!is_64_bit_mode(vcpu)) + ret = (u32)ret; + kvm_rax_write(vcpu, ret); + ++vcpu->stat.hypercalls; + return kvm_skip_emulated_instruction(vcpu); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; @@ -8613,6 +8637,28 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_sched_yield(vcpu, a0); ret = 0; break; + case KVM_HC_MAP_GPA_RANGE: { + u64 gpa = a0, npages = a1, attrs = a2; + + ret = -KVM_ENOSYS; + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + break; + + if (!PAGE_ALIGNED(gpa) || !npages || + gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) { + ret = -KVM_EINVAL; + break; + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = attrs; + vcpu->run->hypercall.longmode = op_64_bit; + vcpu->arch.complete_userspace_io = complete_hypercall_exit; + return 0; + } default: ret = -KVM_ENOSYS; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 90d44138dbfb..9febe1412f7a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1085,6 +1085,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PTP_KVM 198 #define KVM_CAP_HYPERV_ENFORCE_CPUID 199 #define KVM_CAP_SREGS2 200 +#define KVM_CAP_EXIT_HYPERCALL 201 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index 8b86609849b9..960c7e93d1a9 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -29,6 +29,7 @@ #define KVM_HC_CLOCK_PAIRING 9 #define KVM_HC_SEND_IPI 10 #define KVM_HC_SCHED_YIELD 11 +#define KVM_HC_MAP_GPA_RANGE 12 /* * hypercalls use architecture specific -- cgit v1.2.3 From 2735886c9ef115fc7b40d27bfe73605c38e9d56b Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 9 Jun 2021 00:16:40 -0700 Subject: KVM: LAPIC: Keep stored TMCCT register value 0 after KVM_SET_LAPIC KVM_GET_LAPIC stores the current value of TMCCT and KVM_SET_LAPIC's memcpy stores it in vcpu->arch.apic->regs, KVM_SET_LAPIC could store zero in vcpu->arch.apic->regs after it uses it, and then the stored value would always be zero. In addition, the TMCCT is always computed on-demand and never directly readable. Suggested-by: Paolo Bonzini Signed-off-by: Wanpeng Li Message-Id: <1623223000-18116-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b80e613096b..ba5a27879f1d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2631,6 +2631,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); __start_apic_timer(apic, APIC_TMCCT); + kvm_lapic_set_reg(apic, APIC_TMCCT, 0); kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { -- cgit v1.2.3 From 57a3e96d6d17ae5ac9861ef34af024a627f1c3bb Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jun 2021 12:57:09 +1200 Subject: KVM: x86/mmu: Fix return value in tdp_mmu_map_handle_target_level() Currently tdp_mmu_map_handle_target_level() returns 0, which is RET_PF_RETRY, when page fault is actually fixed. This makes kvm_tdp_mmu_map() also return RET_PF_RETRY in this case, instead of RET_PF_FIXED. Fix by initializing ret to RET_PF_FIXED. Note that kvm_mmu_page_fault() resumes guest on both RET_PF_RETRY and RET_PF_FIXED, which means in practice returning the two won't make difference, so this fix alone won't be necessary for stable tree. Fixes: bb18842e2111 ("kvm: x86/mmu: Add TDP MMU PF handler") Reviewed-by: Sean Christopherson Reviewed-by: Ben Gardon Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index cc13e001f3de..6c9c6917925a 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -914,7 +914,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, kvm_pfn_t pfn, bool prefault) { u64 new_spte; - int ret = 0; + int ret = RET_PF_FIXED; int make_spte_ret = 0; if (unlikely(is_noslot_pfn(pfn))) -- cgit v1.2.3 From 857f84743e4b78500afae010d866675642e18e90 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jun 2021 12:57:10 +1200 Subject: KVM: x86/mmu: Fix pf_fixed count in tdp_mmu_map_handle_target_level() Currently pf_fixed is not increased when prefault is true. This is not correct, since prefault here really means "async page fault completed". In that case, the original page fault from the guest was morphed into as async page fault and pf_fixed was not increased. So when prefault indicates async page fault is completed, pf_fixed should be increased. Additionally, currently pf_fixed is also increased even when page fault is spurious, while legacy MMU increases pf_fixed when page fault returns RET_PF_EMULATE or RET_PF_FIXED. To fix above two issues, change to increase pf_fixed when return value is not RET_PF_SPURIOUS (RET_PF_RETRY has already been ruled out by reaching here). More information: https://lore.kernel.org/kvm/cover.1620200410.git.kai.huang@intel.com/T/#mbb5f8083e58a2cd262231512b9211cbe70fc3bd5 Fixes: bb18842e2111 ("kvm: x86/mmu: Add TDP MMU PF handler") Reviewed-by: Sean Christopherson Signed-off-by: Kai Huang Message-Id: <2ea8b7f5d4f03c99b32bc56fc982e1e4e3d3fc6b.1623717884.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6c9c6917925a..efb7503ed4d5 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -951,7 +951,11 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, rcu_dereference(iter->sptep)); } - if (!prefault) + /* + * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be + * consistent with legacy MMU behavior. + */ + if (ret != RET_PF_SPURIOUS) vcpu->stat.pf_fixed++; return ret; -- cgit v1.2.3 From f1b8325508327a302f1d5cd8a4bf51e2c9c72fa9 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 15 Jun 2021 12:57:11 +1200 Subject: KVM: x86/mmu: Fix TDP MMU page table level TDP MMU iterator's level is identical to page table's actual level. For instance, for the last level page table (whose entry points to one 4K page), iter->level is 1 (PG_LEVEL_4K), and in case of 5 level paging, the iter->level is mmu->shadow_root_level, which is 5. However, struct kvm_mmu_page's level currently is not set correctly when it is allocated in kvm_tdp_mmu_map(). When iterator hits non-present SPTE and needs to allocate a new child page table, currently iter->level, which is the level of the page table where the non-present SPTE belongs to, is used. This results in struct kvm_mmu_page's level always having its parent's level (excpet root table's level, which is initialized explicitly using mmu->shadow_root_level). This is kinda wrong, and not consistent with existing non TDP MMU code. Fortuantely sp->role.level is only used in handle_removed_tdp_mmu_page() and kvm_tdp_mmu_zap_sp(), and they are already aware of this and behave correctly. However to make it consistent with legacy MMU code (and fix the issue that both root page table and its child page table have shadow_root_level), use iter->level - 1 in kvm_tdp_mmu_map(), and change handle_removed_tdp_mmu_page() and kvm_tdp_mmu_zap_sp() accordingly. Reviewed-by: Ben Gardon Signed-off-by: Kai Huang Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 8 ++++---- arch/x86/kvm/mmu/tdp_mmu.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index efb7503ed4d5..4d658882a4d8 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -337,7 +337,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, for (i = 0; i < PT64_ENT_PER_PAGE; i++) { sptep = rcu_dereference(pt) + i; - gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)); + gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); if (shared) { /* @@ -379,12 +379,12 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, WRITE_ONCE(*sptep, REMOVED_SPTE); } handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn, - old_child_spte, REMOVED_SPTE, level - 1, + old_child_spte, REMOVED_SPTE, level, shared); } kvm_flush_remote_tlbs_with_address(kvm, gfn, - KVM_PAGES_PER_HPAGE(level)); + KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -1030,7 +1030,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (is_removed_spte(iter.old_spte)) break; - sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level); + sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1); child_pt = sp->spt; new_spte = make_nonleaf_spte(child_pt, diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index f7a7990da11d..408aa49731d5 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -31,7 +31,7 @@ static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { - gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level + 1); /* * Don't allow yielding, as the caller may have a flush pending. Note, -- cgit v1.2.3 From 386093c68ba3e8bcfe7f46deba901e0e80713c29 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 8 Mar 2021 14:02:37 +0100 Subject: um: allow not setting extra rpaths in the linux binary There doesn't seem to be any reason for the rpath being set in the binaries, at on systems that I tested on. On the other hand, setting rpath is actually harming binaries in some cases, e.g. if using nix-based compilation environments where /lib & /lib64 are not part of the actual environment. Add a new Kconfig option (under EXPERT, for less user confusion) that allows disabling the rpath additions. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 13 +++++++++++++ arch/um/Makefile | 3 ++- arch/x86/Makefile.um | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 328ba973c07d..fe6d898aeb85 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -106,6 +106,19 @@ config LD_SCRIPT_DYN depends on !LD_SCRIPT_STATIC select MODULE_REL_CRCS if MODVERSIONS +config LD_SCRIPT_DYN_RPATH + bool "set rpath in the binary" if EXPERT + default y + depends on LD_SCRIPT_DYN + help + Add /lib (and /lib64 for 64-bit) to the linux binary's rpath + explicitly. + + You may need to turn this off if compiling for nix systems + that have their libraries in random /nix directories and + might otherwise unexpected use libraries from /lib or /lib64 + instead of the desired ones. + config HOSTFS tristate "Host filesystem" help diff --git a/arch/um/Makefile b/arch/um/Makefile index 1cea46ff9bb7..12a7acef0357 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -118,7 +118,8 @@ archprepare: $(Q)$(MAKE) $(build)=$(HOST_DIR)/um include/generated/user_constants.h LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib $(call cc-option, -no-pie) +LINK-$(CONFIG_LD_SCRIPT_DYN) += $(call cc-option, -no-pie) +LINK-$(CONFIG_LD_SCRIPT_DYN_RPATH) += -Wl,-rpath,/lib CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \ -fno-stack-protector $(call cc-option, -fno-stack-protector-all) diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 1db7913795f5..b3c1ae084180 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -44,7 +44,7 @@ ELF_FORMAT := elf64-x86-64 # Not on all 64-bit distros /lib is a symlink to /lib64. PLD is an example. -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib64 +LINK-$(CONFIG_LD_SCRIPT_DYN_RPATH) += -Wl,-rpath,/lib64 LINK-y += -m64 endif -- cgit v1.2.3 From b03fbd4ff24c5f075e58eb19261d5f8b3e40d7c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2021 10:28:12 +0200 Subject: sched: Introduce task_is_running() Replace a bunch of 'p->state == TASK_RUNNING' with a new helper: task_is_running(p). Signed-off-by: Peter Zijlstra (Intel) Acked-by: Davidlohr Bueso Acked-by: Geert Uytterhoeven Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210611082838.222401495@infradead.org --- arch/alpha/kernel/process.c | 2 +- arch/arc/kernel/stacktrace.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/csky/kernel/stacktrace.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/hexagon/kernel/process.c | 2 +- arch/ia64/kernel/process.c | 4 ++-- arch/m68k/kernel/process.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/nds32/kernel/process.c | 2 +- arch/nios2/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 4 ++-- arch/powerpc/kernel/process.c | 4 ++-- arch/riscv/kernel/stacktrace.c | 2 +- arch/s390/kernel/process.c | 2 +- arch/s390/mm/fault.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sparc/kernel/process_32.c | 3 +-- arch/sparc/kernel/process_64.c | 3 +-- arch/um/kernel/process.c | 2 +- arch/x86/kernel/process.c | 4 ++-- arch/xtensa/kernel/process.c | 2 +- block/blk-mq.c | 2 +- include/linux/sched.h | 2 ++ kernel/kcsan/report.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- kernel/sched/core.c | 6 +++--- kernel/sched/stats.h | 2 +- kernel/signal.c | 2 +- kernel/softirq.c | 3 +-- mm/compaction.c | 2 +- 33 files changed, 40 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 5112ab996394..ef0c08ed0481 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -380,7 +380,7 @@ get_wchan(struct task_struct *p) { unsigned long schedule_frame; unsigned long pc; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; /* * This one depends on the frame size of schedule(). Do a diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index f73da203b170..1b9576d21e24 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -83,7 +83,7 @@ seed_unwind_frame_info(struct task_struct *tsk, struct pt_regs *regs, * is safe-kept and BLINK at a well known location in there */ - if (tsk->state == TASK_RUNNING) + if (task_is_running(tsk)) return -1; frame_info->task = tsk; diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 6324f4db9b02..fc9e8b37eaa8 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -288,7 +288,7 @@ unsigned long get_wchan(struct task_struct *p) struct stackframe frame; unsigned long stack_page; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; frame.fp = thread_saved_fp(p); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index b4bb67f17a2c..14f3c19c6ad2 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -598,7 +598,7 @@ unsigned long get_wchan(struct task_struct *p) struct stackframe frame; unsigned long stack_page, ret = 0; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; stack_page = (unsigned long)try_get_task_stack(p); diff --git a/arch/csky/kernel/stacktrace.c b/arch/csky/kernel/stacktrace.c index 16ae20a0af34..1b280ef08004 100644 --- a/arch/csky/kernel/stacktrace.c +++ b/arch/csky/kernel/stacktrace.c @@ -115,7 +115,7 @@ unsigned long get_wchan(struct task_struct *task) { unsigned long pc = 0; - if (likely(task && task != current && task->state != TASK_RUNNING)) + if (likely(task && task != current && !task_is_running(task))) walk_stackframe(task, NULL, save_wchan, &pc); return pc; } diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 46b1342ce515..2ac27e4248a4 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -134,7 +134,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long stack_page; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; stack_page = (unsigned long)p; diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index c61165c99ae0..6a6835fb4242 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -135,7 +135,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; stack_page = (unsigned long)task_stack_page(p); diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 7e1a1525e202..e56d63f4abf9 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -529,7 +529,7 @@ get_wchan (struct task_struct *p) unsigned long ip; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; /* @@ -542,7 +542,7 @@ get_wchan (struct task_struct *p) */ unw_init_from_blocked_task(&info, p); do { - if (p->state == TASK_RUNNING) + if (task_is_running(p)) return 0; if (unw_unwind(&info) < 0) return 0; diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index da83cc83e791..db49f9091711 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -268,7 +268,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long fp, pc; unsigned long stack_page; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; stack_page = (unsigned long)task_stack_page(p); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index bff080db0294..73c8e7990a97 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -662,7 +662,7 @@ unsigned long get_wchan(struct task_struct *task) unsigned long ra = 0; #endif - if (!task || task == current || task->state == TASK_RUNNING) + if (!task || task == current || task_is_running(task)) goto out; if (!task_stack_page(task)) goto out; diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c index c1327e552ec6..391895b54d13 100644 --- a/arch/nds32/kernel/process.c +++ b/arch/nds32/kernel/process.c @@ -239,7 +239,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long stack_start, stack_end; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; if (IS_ENABLED(CONFIG_FRAME_POINTER)) { diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index c5f916ca6845..9ff37ba2bb60 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -223,7 +223,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long stack_page; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; stack_page = (unsigned long)p; diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index b144fbe29bc1..184ec3c1eae4 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -249,7 +249,7 @@ get_wchan(struct task_struct *p) unsigned long ip; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; /* @@ -260,7 +260,7 @@ get_wchan(struct task_struct *p) do { if (unwind_once(&info) < 0) return 0; - if (p->state == TASK_RUNNING) + if (task_is_running(p)) return 0; ip = info.ip; if (!in_sched_functions(ip)) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 89e34aa273e2..8935c5696bce 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2084,7 +2084,7 @@ static unsigned long __get_wchan(struct task_struct *p) unsigned long ip, sp; int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; sp = p->thread.ksp; @@ -2094,7 +2094,7 @@ static unsigned long __get_wchan(struct task_struct *p) do { sp = *(unsigned long *)sp; if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || - p->state == TASK_RUNNING) + task_is_running(p)) return 0; if (count > 0) { ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index bde85fc53357..ff467b98c3e3 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -132,7 +132,7 @@ unsigned long get_wchan(struct task_struct *task) { unsigned long pc = 0; - if (likely(task && task != current && task->state != TASK_RUNNING)) + if (likely(task && task != current && !task_is_running(task))) walk_stackframe(task, NULL, save_wchan, &pc); return pc; } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e20bed1ed34a..7ae5dde9c54d 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -180,7 +180,7 @@ unsigned long get_wchan(struct task_struct *p) struct unwind_state state; unsigned long ip = 0; - if (!p || p == current || p->state == TASK_RUNNING || !task_stack_page(p)) + if (!p || p == current || task_is_running(p) || !task_stack_page(p)) return 0; if (!try_get_task_stack(p)) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 826d01777361..8ae3dc5783fd 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -702,7 +702,7 @@ static void pfault_interrupt(struct ext_code ext_code, * interrupt since it must be a leftover of a PFAULT * CANCEL operation which didn't remove all pending * completion interrupts. */ - if (tsk->state == TASK_RUNNING) + if (task_is_running(tsk)) tsk->thread.pfault_wait = -1; } } else { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 1aa508eb0823..717de05c81f4 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -186,7 +186,7 @@ unsigned long get_wchan(struct task_struct *p) { unsigned long pc; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; /* diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 3b9794978e5b..93983d6d431d 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -376,8 +376,7 @@ unsigned long get_wchan(struct task_struct *task) struct reg_window32 *rw; int count = 0; - if (!task || task == current || - task->state == TASK_RUNNING) + if (!task || task == current || task_is_running(task)) goto out; fp = task_thread_info(task)->ksp + bias; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 7afd0a859a78..d33c58a58d4f 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -674,8 +674,7 @@ unsigned long get_wchan(struct task_struct *task) unsigned long ret = 0; int count = 0; - if (!task || task == current || - task->state == TASK_RUNNING) + if (!task || task == current || task_is_running(task)) goto out; tp = task_thread_info(task); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index c5011064b5dd..457a38db368b 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -369,7 +369,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long stack_page, sp, ip; bool seen_sched = 0; - if ((p == NULL) || (p == current) || (p->state == TASK_RUNNING)) + if ((p == NULL) || (p == current) || task_is_running(p)) return 0; stack_page = (unsigned long) task_stack_page(p); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e1f38179f49..e52b208b4641 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -931,7 +931,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long start, bottom, top, sp, fp, ip, ret = 0; int count = 0; - if (p == current || p->state == TASK_RUNNING) + if (p == current || task_is_running(p)) return 0; if (!try_get_task_stack(p)) @@ -975,7 +975,7 @@ unsigned long get_wchan(struct task_struct *p) goto out; } fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); - } while (count++ < 16 && p->state != TASK_RUNNING); + } while (count++ < 16 && !task_is_running(p)); out: put_task_stack(p); diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 9534ef515d74..060165340612 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -304,7 +304,7 @@ unsigned long get_wchan(struct task_struct *p) unsigned long stack_page = (unsigned long) task_stack_page(p); int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) + if (!p || p == current || task_is_running(p)) return 0; sp = p->thread.sp; diff --git a/block/blk-mq.c b/block/blk-mq.c index c86c01bfecdb..655db5fb46d0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3926,7 +3926,7 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); - if (current->state == TASK_RUNNING) + if (task_is_running(current)) return 1; if (ret < 0 || !spin) break; diff --git a/include/linux/sched.h b/include/linux/sched.h index ac5a7d29fd4f..2cd56352dae1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -113,6 +113,8 @@ struct task_group; __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) +#define task_is_running(task) (READ_ONCE((task)->state) == TASK_RUNNING) + #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 13dce3c664d6..56016e8e7461 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -460,7 +460,7 @@ static void set_other_info_task_blocking(unsigned long *flags, * We may be instrumenting a code-path where current->state is already * something other than TASK_RUNNING. */ - const bool is_running = current->state == TASK_RUNNING; + const bool is_running = task_is_running(current); /* * To avoid deadlock in case we are in an interrupt here and this is a * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 7641bd407239..4931a93c5162 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -760,7 +760,7 @@ static void lockdep_print_held_locks(struct task_struct *p) * It's not reliable to print a task's held locks if it's not sleeping * and it's not the current task. */ - if (p->state == TASK_RUNNING && p != current) + if (p != current && task_is_running(p)) return; for (i = 0; i < depth; i++) { printk(" #%d: ", i); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ad0156b86937..4d6962048c30 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2768,7 +2768,7 @@ EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); #ifdef CONFIG_SMP static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) { - return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : ""; + return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; } #else // #ifdef CONFIG_SMP static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75655cdee3bb..618c2b5a5758 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5974,7 +5974,7 @@ static inline void sched_submit_work(struct task_struct *tsk) { unsigned int task_flags; - if (!tsk->state) + if (task_is_running(tsk)) return; task_flags = tsk->flags; @@ -7949,7 +7949,7 @@ again: if (curr->sched_class != p->sched_class) goto out_unlock; - if (task_running(p_rq, p) || p->state) + if (task_running(p_rq, p) || !task_is_running(p)) goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p); @@ -8152,7 +8152,7 @@ void sched_show_task(struct task_struct *p) pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); - if (p->state == TASK_RUNNING) + if (task_is_running(p)) pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 111072ee9663..d8f8eb0c655b 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -217,7 +217,7 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) rq_sched_info_depart(rq, delta); - if (t->state == TASK_RUNNING) + if (task_is_running(t)) sched_info_enqueue(rq, t); } diff --git a/kernel/signal.c b/kernel/signal.c index f7c6ffcbd044..5fc8fcf70c24 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4719,7 +4719,7 @@ void kdb_send_sig(struct task_struct *t, int sig) } new_t = kdb_prev_t != t; kdb_prev_t = t; - if (t->state != TASK_RUNNING && new_t) { + if (!task_is_running(t) && new_t) { spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" diff --git a/kernel/softirq.c b/kernel/softirq.c index 5ddc3b15a4db..f3a012179f47 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -92,8 +92,7 @@ static bool ksoftirqd_running(unsigned long pending) if (pending & SOFTIRQ_NOW_MASK) return false; - return tsk && (tsk->state == TASK_RUNNING) && - !__kthread_should_park(tsk); + return tsk && task_is_running(tsk) && !__kthread_should_park(tsk); } #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/mm/compaction.c b/mm/compaction.c index 84fde270ae74..725f564a5664 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1955,7 +1955,7 @@ static inline bool is_via_compact_memory(int order) static bool kswapd_is_running(pg_data_t *pgdat) { - return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING); + return pgdat->kswapd && task_is_running(pgdat->kswapd); } /* -- cgit v1.2.3 From 23f079c2494e9b25048db970b1f4dadf19c3c990 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Jun 2021 09:45:32 -0700 Subject: KVM: VMX: Refuse to load kvm_intel if EPT and NX are disabled Refuse to load KVM if NX support is not available and EPT is not enabled. Shadow paging has assumed NX support since commit 9167ab799362 ("KVM: vmx, svm: always run with EFER.NXE=1 when shadow paging is active"), so for all intents and purposes this has been a de facto requirement for over a year. Do not require NX support if EPT is enabled purely because Intel CPUs let firmware disable NX support via MSR_IA32_MISC_ENABLES. If not for that, VMX (and KVM as a whole) could require NX support with minimal risk to breaking userspace. Fixes: 9167ab799362 ("KVM: vmx, svm: always run with EFER.NXE=1 when shadow paging is active") Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Message-Id: <20210615164535.2146172-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 68a72c80bd3f..889e83f71235 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7723,6 +7723,12 @@ static __init int hardware_setup(void) !cpu_has_vmx_invept_global()) enable_ept = 0; + /* NX support is required for shadow paging. */ + if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; -- cgit v1.2.3 From b26a71a1a5b93531bd93305c9c0c7eae2d5cace1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Jun 2021 09:45:33 -0700 Subject: KVM: SVM: Refuse to load kvm_amd if NX support is not available Refuse to load KVM if NX support is not available. Shadow paging has assumed NX support since commit 9167ab799362 ("KVM: vmx, svm: always run with EFER.NXE=1 when shadow paging is active"), and NPT has assumed NX support since commit b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation"). While the NX huge pages mitigation should not be enabled by default for AMD CPUs, it can be turned on by userspace at will. Unlike Intel CPUs, AMD does not provide a way for firmware to disable NX support, and Linux always sets EFER.NX=1 if it is supported. Given that it's extremely unlikely that a CPU supports NPT but not NX, making NX a formal requirement is far simpler than adding requirements to the mitigation flow. Fixes: 9167ab799362 ("KVM: vmx, svm: always run with EFER.NXE=1 when shadow paging is active") Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation") Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Message-Id: <20210615164535.2146172-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b6afa6b63c8f..12c06ea28f5c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -952,6 +952,16 @@ static __init int svm_hardware_setup(void) int r; unsigned int order = get_order(IOPM_SIZE); + /* + * NX is required for shadow paging and for NPT if the NX huge pages + * mitigation is enabled. + */ + if (!boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + kvm_enable_efer_bits(EFER_NX); + iopm_pages = alloc_pages(GFP_KERNEL, order); if (!iopm_pages) @@ -965,9 +975,6 @@ static __init int svm_hardware_setup(void) supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - if (boot_cpu_has(X86_FEATURE_NX)) - kvm_enable_efer_bits(EFER_NX); - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); -- cgit v1.2.3 From 8bbed95d2cb6e5de8a342d761a89b0a04faed7be Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 15 Jun 2021 09:45:34 -0700 Subject: KVM: x86: WARN and reject loading KVM if NX is supported but not enabled WARN if NX is reported as supported but not enabled in EFER. All flavors of the kernel, including non-PAE 32-bit kernels, set EFER.NX=1 if NX is supported, even if NX usage is disable via kernel command line. KVM relies on NX being enabled if it's supported, e.g. KVM will generate illegal NPT entries if nx_huge_pages is enabled and NX is supported but not enabled. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Message-Id: <20210615164535.2146172-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b898ec8d349..76dae88cf524 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10920,6 +10920,9 @@ int kvm_arch_hardware_setup(void *opaque) int r; rdmsrl_safe(MSR_EFER, &host_efer); + if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) && + !(host_efer & EFER_NX))) + return -EIO; if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); -- cgit v1.2.3 From c62efff28bb5eb60d60415a0dd0c864c64be0671 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Jun 2021 06:42:10 -0400 Subject: KVM: x86: Stub out is_tdp_mmu_root on 32-bit hosts If is_tdp_mmu_root is not inlined, the elimination of TDP MMU calls as dead code might not work out. To avoid this, explicitly declare the stubbed is_tdp_mmu_root on 32-bit hosts. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 408aa49731d5..78d8a296f0b6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -85,12 +85,6 @@ bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } -#else -static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } -static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } -static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -#endif static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) { @@ -107,5 +101,12 @@ static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) return is_tdp_mmu_page(sp) && sp->root_count; } +#else +static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } +static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } +static inline bool is_tdp_mmu_root(hpa_t hpa) { return false; } +#endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ -- cgit v1.2.3 From aa23c0ad14228ccfcd0b6f799dd34b348a5f2b1e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 17 Jun 2021 23:19:45 +0000 Subject: KVM: x86/mmu: Remove redundant is_tdp_mmu_root check The check for is_tdp_mmu_root in kvm_tdp_mmu_map is redundant because kvm_tdp_mmu_map's only caller (direct_page_fault) already checks is_tdp_mmu_root. Suggested-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210617231948.2591431-2-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4d658882a4d8..d4c254dc4d5f 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -985,8 +985,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); -- cgit v1.2.3 From 0b873fd7fb53ed7343ee7ee166e1373aec02a9cb Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 17 Jun 2021 23:19:46 +0000 Subject: KVM: x86/mmu: Remove redundant is_tdp_mmu_enabled check This check is redundant because the root shadow page will only be a TDP MMU page if is_tdp_mmu_enabled() returns true, and is_tdp_mmu_enabled() never changes for the lifetime of a VM. It's possible that this check was added for performance reasons but it is unlikely that it is useful in practice since to_shadow_page() is cheap. That being said, this patch also caches the return value of is_tdp_mmu_root() in direct_page_fault() since there's no reason to duplicate the call so many times, so performance is not a concern. Suggested-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210617231948.2591431-3-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 ++++++----- arch/x86/kvm/mmu/tdp_mmu.h | 4 +--- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7d3e57678d34..10c1c2029d35 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3608,7 +3608,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) return reserved; } - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_root(vcpu->arch.mmu->root_hpa)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); @@ -3780,6 +3780,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { + bool is_tdp_mmu_fault = is_tdp_mmu_root(vcpu->arch.mmu->root_hpa); bool write = error_code & PFERR_WRITE_MASK; bool map_writable; @@ -3792,7 +3793,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) { + if (!is_tdp_mmu_fault) { r = fast_page_fault(vcpu, gpa, error_code); if (r != RET_PF_INVALID) return r; @@ -3814,7 +3815,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, r = RET_PF_RETRY; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) read_lock(&vcpu->kvm->mmu_lock); else write_lock(&vcpu->kvm->mmu_lock); @@ -3825,7 +3826,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (r) goto out_unlock; - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, pfn, prefault); else @@ -3833,7 +3834,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, prefault, is_tdp); out_unlock: - if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu_fault) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 78d8a296f0b6..f6e0667cf4b6 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -86,12 +86,10 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } -static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) +static inline bool is_tdp_mmu_root(hpa_t hpa) { struct kvm_mmu_page *sp; - if (!is_tdp_mmu_enabled(kvm)) - return false; if (WARN_ON(!VALID_PAGE(hpa))) return false; -- cgit v1.2.3 From 63c0cac938edfa5d72bfbe8f1eeb9d47b397829c Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 17 Jun 2021 23:19:47 +0000 Subject: KVM: x86/mmu: Refactor is_tdp_mmu_root into is_tdp_mmu This change simplifies the call sites slightly and also abstracts away the implementation detail of looking at root_hpa as the mechanism for determining if the mmu is the TDP MMU. Suggested-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210617231948.2591431-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/tdp_mmu.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 10c1c2029d35..f1dd8308f080 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3608,7 +3608,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) return reserved; } - if (is_tdp_mmu_root(vcpu->arch.mmu->root_hpa)) + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); @@ -3780,7 +3780,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, bool prefault, int max_level, bool is_tdp) { - bool is_tdp_mmu_fault = is_tdp_mmu_root(vcpu->arch.mmu->root_hpa); + bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); bool write = error_code & PFERR_WRITE_MASK; bool map_writable; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index f6e0667cf4b6..b981a044ab55 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -86,9 +86,10 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } -static inline bool is_tdp_mmu_root(hpa_t hpa) +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { struct kvm_mmu_page *sp; + hpa_t hpa = mmu->root_hpa; if (WARN_ON(!VALID_PAGE(hpa))) return false; @@ -104,7 +105,7 @@ static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } -static inline bool is_tdp_mmu_root(hpa_t hpa) { return false; } +static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif #endif /* __KVM_X86_MMU_TDP_MMU_H */ -- cgit v1.2.3 From 0485cf8dbe964b6cc485178da6ee8ae7b2d0d15c Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 17 Jun 2021 23:19:48 +0000 Subject: KVM: x86/mmu: Remove redundant root_hpa checks The root_hpa checks below the top-level check in kvm_mmu_page_fault are theoretically redundant since there is no longer a way for the root_hpa to be reset during a page fault. The details of why are described in commit ddce6208217c ("KVM: x86/mmu: Move root_hpa validity checks to top of page fault handler") __direct_map, kvm_tdp_mmu_map, and get_mmio_spte are all only reachable through kvm_mmu_page_fault, therefore their root_hpa checks are redundant. Suggested-by: Sean Christopherson Signed-off-by: David Matlack Message-Id: <20210617231948.2591431-5-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 8 -------- arch/x86/kvm/mmu/tdp_mmu.c | 3 --- 2 files changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f1dd8308f080..84d48a33e38b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2859,9 +2859,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, gfn_t gfn = gpa >> PAGE_SHIFT; gfn_t base_gfn = gfn; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); @@ -3603,11 +3600,6 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { - *sptep = 0ull; - return reserved; - } - if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d4c254dc4d5f..caac4ddb46df 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -983,9 +983,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int level; int req_level; - if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) - return RET_PF_RETRY; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, huge_page_disallowed, &req_level); -- cgit v1.2.3 From cacf994a91d3a55c0c2f853d6429cd7b86113915 Mon Sep 17 00:00:00 2001 From: Mikel Rychliski Date: Fri, 11 Jun 2021 17:48:23 -0400 Subject: PCI: Add AMD RS690 quirk to enable 64-bit DMA Although the AMD RS690 chipset has 64-bit DMA support, BIOS implementations sometimes fail to configure the memory limit registers correctly. The Acer F690GVM mainboard uses this chipset and a Marvell 88E8056 NIC. The sky2 driver programs the NIC to use 64-bit DMA, which will not work: sky2 0000:02:00.0: error interrupt status=0x8 sky2 0000:02:00.0 eth0: tx timeout sky2 0000:02:00.0 eth0: transmit ring 0 .. 22 report=0 done=0 Other drivers required by this mainboard either don't support 64-bit DMA, or have it disabled using driver specific quirks. For example, the ahci driver has quirks to enable or disable 64-bit DMA depending on the BIOS version (see ahci_sb600_enable_64bit() in ahci.c). This ahci quirk matches against the SB600 SATA controller, but the real issue is almost certainly with the RS690 PCI host that it was commonly attached to. To avoid this issue in all drivers with 64-bit DMA support, fix the configuration of the PCI host. If the kernel is aware of physical memory above 4GB, but the BIOS never configured the PCI host with this information, update the registers with our values. [bhelgaas: drop PCI_DEVICE_ID_ATI_RS690 definition] Link: https://lore.kernel.org/r/20210611214823.4898-1-mikel@mikelr.com Signed-off-by: Mikel Rychliski Signed-off-by: Bjorn Helgaas --- arch/x86/pci/fixup.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 02dc64625e64..2edd86649468 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -779,4 +779,48 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar); DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar); +#define RS690_LOWER_TOP_OF_DRAM2 0x30 +#define RS690_LOWER_TOP_OF_DRAM2_VALID 0x1 +#define RS690_UPPER_TOP_OF_DRAM2 0x31 +#define RS690_HTIU_NB_INDEX 0xA8 +#define RS690_HTIU_NB_INDEX_WR_ENABLE 0x100 +#define RS690_HTIU_NB_DATA 0xAC + +/* + * Some BIOS implementations support RAM above 4GB, but do not configure the + * PCI host to respond to bus master accesses for these addresses. These + * implementations set the TOP_OF_DRAM_SLOT1 register correctly, so PCI DMA + * works as expected for addresses below 4GB. + * + * Reference: "AMD RS690 ASIC Family Register Reference Guide" (pg. 2-57) + * https://www.amd.com/system/files/TechDocs/43372_rs690_rrg_3.00o.pdf + */ +static void rs690_fix_64bit_dma(struct pci_dev *pdev) +{ + u32 val = 0; + phys_addr_t top_of_dram = __pa(high_memory - 1) + 1; + + if (top_of_dram <= (1ULL << 32)) + return; + + pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX, + RS690_LOWER_TOP_OF_DRAM2); + pci_read_config_dword(pdev, RS690_HTIU_NB_DATA, &val); + + if (val) + return; + + pci_info(pdev, "Adjusting top of DRAM to %pa for 64-bit DMA support\n", &top_of_dram); + + pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX, + RS690_UPPER_TOP_OF_DRAM2 | RS690_HTIU_NB_INDEX_WR_ENABLE); + pci_write_config_dword(pdev, RS690_HTIU_NB_DATA, top_of_dram >> 32); + + pci_write_config_dword(pdev, RS690_HTIU_NB_INDEX, + RS690_LOWER_TOP_OF_DRAM2 | RS690_HTIU_NB_INDEX_WR_ENABLE); + pci_write_config_dword(pdev, RS690_HTIU_NB_DATA, + top_of_dram | RS690_LOWER_TOP_OF_DRAM2_VALID); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma); + #endif -- cgit v1.2.3 From 28e5e44aa3f4e0e0370864ed008fb5e2d85f4dc8 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Thu, 17 Jun 2021 12:46:57 -0700 Subject: x86/mm: Avoid truncating memblocks for SGX memory tl;dr: Several SGX users reported seeing the following message on NUMA systems: sgx: [Firmware Bug]: Unable to map EPC section to online node. Fallback to the NUMA node 0. This turned out to be the memblock code mistakenly throwing away SGX memory. === Full Changelog === The 'max_pfn' variable represents the highest known RAM address. It can be used, for instance, to quickly determine for which physical addresses there is mem_map[] space allocated. The numa_meminfo code makes an effort to throw out ("trim") all memory blocks which are above 'max_pfn'. SGX memory is not considered RAM (it is marked as "Reserved" in the e820) and is not taken into account by max_pfn. Despite this, SGX memory areas have NUMA affinity and are enumerated in the ACPI SRAT table. The existing SGX code uses the numa_meminfo mechanism to look up the NUMA affinity for its memory areas. In cases where SGX memory was above max_pfn (usually just the one EPC section in the last highest NUMA node), the numa_memblock is truncated at 'max_pfn', which is below the SGX memory. When the SGX code tries to look up the affinity of this memory, it fails and produces an error message: sgx: [Firmware Bug]: Unable to map EPC section to online node. Fallback to the NUMA node 0. and assigns the memory to NUMA node 0. Instead of silently truncating the memory block at 'max_pfn' and dropping the SGX memory, add the truncated portion to 'numa_reserved_meminfo'. This allows the SGX code to later determine the NUMA affinity of its 'Reserved' area. Before, numa_meminfo looked like this (from 'crash'): blk = { start = 0x0, end = 0x2080000000, nid = 0x0 } { start = 0x2080000000, end = 0x4000000000, nid = 0x1 } numa_reserved_meminfo is empty. With this, numa_meminfo looks like this: blk = { start = 0x0, end = 0x2080000000, nid = 0x0 } { start = 0x2080000000, end = 0x4000000000, nid = 0x1 } and numa_reserved_meminfo has an entry for node 1's SGX memory: blk = { start = 0x4000000000, end = 0x4080000000, nid = 0x1 } [ daveh: completely rewrote/reworked changelog ] Fixes: 5d30f92e7631 ("x86/NUMA: Provide a range-to-target_node lookup facility") Reported-by: Reinette Chatre Signed-off-by: Fan Du Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Reviewed-by: Jarkko Sakkinen Reviewed-by: Dan Williams Reviewed-by: Dave Hansen Cc: Link: https://lkml.kernel.org/r/20210617194657.0A99CB22@viggo.jf.intel.com --- arch/x86/mm/numa.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 5eb4dc2b97da..e94da744386f 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -254,7 +254,13 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) /* make sure all non-reserved blocks are inside the limits */ bi->start = max(bi->start, low); - bi->end = min(bi->end, high); + + /* preserve info for non-RAM areas above 'max_pfn': */ + if (bi->end > high) { + numa_add_memblk_to(bi->nid, high, bi->end, + &numa_reserved_meminfo); + bi->end = high; + } /* and there's no empty block */ if (bi->start >= bi->end) -- cgit v1.2.3 From d187f217335dba2b49fc9002aab2004e04acddee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 18 Jun 2021 13:54:08 +0200 Subject: x86/sev: Make sure IRQs are disabled while GHCB is active The #VC handler only cares about IRQs being disabled while the GHCB is active, as it must not be interrupted by something which could cause another #VC while it holds the GHCB (NMI is the exception for which the backup GHCB exits). Make sure nothing interrupts the code path while the GHCB is active by making sure that callers of __sev_{get,put}_ghcb() have disabled interrupts upfront. [ bp: Massage commit message. ] Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210618115409.22735-2-joro@8bytes.org --- arch/x86/kernel/sev.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 8178db07a06a..9f32cbb773d9 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -12,7 +12,6 @@ #include /* For show_regs() */ #include #include -#include #include #include #include @@ -192,11 +191,19 @@ void noinstr __sev_es_ist_exit(void) this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); } -static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) +/* + * Nothing shall interrupt this code path while holding the per-CPU + * GHCB. The backup GHCB is only for NMIs interrupting this path. + * + * Callers must disable local interrupts around it. + */ +static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) { struct sev_es_runtime_data *data; struct ghcb *ghcb; + WARN_ON(!irqs_disabled()); + data = this_cpu_read(runtime_data); ghcb = &data->ghcb_page; @@ -213,7 +220,9 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state) data->ghcb_active = false; data->backup_ghcb_active = false; + instrumentation_begin(); panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); + instrumentation_end(); } /* Mark backup_ghcb active before writing to it */ @@ -486,11 +495,13 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt /* Include code shared with pre-decompression boot stage */ #include "sev-shared.c" -static __always_inline void sev_es_put_ghcb(struct ghcb_state *state) +static noinstr void __sev_put_ghcb(struct ghcb_state *state) { struct sev_es_runtime_data *data; struct ghcb *ghcb; + WARN_ON(!irqs_disabled()); + data = this_cpu_read(runtime_data); ghcb = &data->ghcb_page; @@ -514,7 +525,7 @@ void noinstr __sev_es_nmi_complete(void) struct ghcb_state state; struct ghcb *ghcb; - ghcb = sev_es_get_ghcb(&state); + ghcb = __sev_get_ghcb(&state); vc_ghcb_invalidate(ghcb); ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); @@ -524,7 +535,7 @@ void noinstr __sev_es_nmi_complete(void) sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); VMGEXIT(); - sev_es_put_ghcb(&state); + __sev_put_ghcb(&state); } static u64 get_jump_table_addr(void) @@ -536,7 +547,7 @@ static u64 get_jump_table_addr(void) local_irq_save(flags); - ghcb = sev_es_get_ghcb(&state); + ghcb = __sev_get_ghcb(&state); vc_ghcb_invalidate(ghcb); ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); @@ -550,7 +561,7 @@ static u64 get_jump_table_addr(void) ghcb_sw_exit_info_2_is_valid(ghcb)) ret = ghcb->save.sw_exit_info_2; - sev_es_put_ghcb(&state); + __sev_put_ghcb(&state); local_irq_restore(flags); @@ -675,7 +686,7 @@ static void sev_es_ap_hlt_loop(void) struct ghcb_state state; struct ghcb *ghcb; - ghcb = sev_es_get_ghcb(&state); + ghcb = __sev_get_ghcb(&state); while (true) { vc_ghcb_invalidate(ghcb); @@ -692,7 +703,7 @@ static void sev_es_ap_hlt_loop(void) break; } - sev_es_put_ghcb(&state); + __sev_put_ghcb(&state); } /* @@ -1351,7 +1362,6 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) } irq_state = irqentry_nmi_enter(regs); - lockdep_assert_irqs_disabled(); instrumentation_begin(); /* @@ -1360,7 +1370,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) * keep the IRQs disabled to protect us against concurrent TLB flushes. */ - ghcb = sev_es_get_ghcb(&state); + ghcb = __sev_get_ghcb(&state); vc_ghcb_invalidate(ghcb); result = vc_init_em_ctxt(&ctxt, regs, error_code); @@ -1368,7 +1378,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) if (result == ES_OK) result = vc_handle_exitcode(&ctxt, ghcb, error_code); - sev_es_put_ghcb(&state); + __sev_put_ghcb(&state); /* Done - now check the result */ switch (result) { -- cgit v1.2.3 From be1a5408868af341f61f93c191b5e346ee88c82a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 18 Jun 2021 13:54:09 +0200 Subject: x86/sev: Split up runtime #VC handler for correct state tracking Split up the #VC handler code into a from-user and a from-kernel part. This allows clean and correct state tracking, as the #VC handler needs to enter NMI-state when raised from kernel mode and plain IRQ state when raised from user-mode. Fixes: 62441a1fb532 ("x86/sev-es: Correctly track IRQ states in runtime #VC handler") Suggested-by: Peter Zijlstra Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210618115409.22735-3-joro@8bytes.org --- arch/x86/entry/entry_64.S | 4 +- arch/x86/include/asm/idtentry.h | 29 +++----- arch/x86/kernel/sev.c | 148 +++++++++++++++++++++------------------- 3 files changed, 91 insertions(+), 90 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a16a5294d55f..1886aaf19914 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -506,7 +506,7 @@ SYM_CODE_START(\asmsym) movq %rsp, %rdi /* pt_regs pointer */ - call \cfunc + call kernel_\cfunc /* * No need to switch back to the IST stack. The current stack is either @@ -517,7 +517,7 @@ SYM_CODE_START(\asmsym) /* Switch to the regular task stack */ .Lfrom_usermode_switch_stack_\@: - idtentry_body safe_stack_\cfunc, has_error_code=1 + idtentry_body user_\cfunc, has_error_code=1 _ASM_NOKPROBE(\asmsym) SYM_CODE_END(\asmsym) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 73d45b0dfff2..cd9f3e304944 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -312,8 +312,8 @@ static __always_inline void __##func(struct pt_regs *regs) */ #define DECLARE_IDTENTRY_VC(vector, func) \ DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \ - __visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \ - __visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code) + __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \ + __visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code) /** * DEFINE_IDTENTRY_IST - Emit code for IST entry points @@ -355,33 +355,24 @@ static __always_inline void __##func(struct pt_regs *regs) DEFINE_IDTENTRY_RAW_ERRORCODE(func) /** - * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler - which runs on a safe stack. + * DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler + when raised from kernel mode * @func: Function name of the entry point * * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE */ -#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \ - DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func) +#define DEFINE_IDTENTRY_VC_KERNEL(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func) /** - * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler - which runs on the VC fall-back stack + * DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler + when raised from user mode * @func: Function name of the entry point * * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE */ -#define DEFINE_IDTENTRY_VC_IST(func) \ - DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func) - -/** - * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler - * @func: Function name of the entry point - * - * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE - */ -#define DEFINE_IDTENTRY_VC(func) \ - DEFINE_IDTENTRY_RAW_ERRORCODE(func) +#define DEFINE_IDTENTRY_VC_USER(func) \ + DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func) #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 9f32cbb773d9..87a4b00f028e 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -793,7 +793,7 @@ void __init sev_es_init_vc_handling(void) sev_es_setup_play_dead(); /* Secondary CPUs use the runtime #VC handler */ - initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication; + initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; } static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) @@ -1231,14 +1231,6 @@ static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, return ES_EXCEPTION; } -static __always_inline void vc_handle_trap_db(struct pt_regs *regs) -{ - if (user_mode(regs)) - noist_exc_debug(regs); - else - exc_debug(regs); -} - static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, struct ghcb *ghcb, unsigned long exit_code) @@ -1334,41 +1326,13 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs) return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); } -/* - * Main #VC exception handler. It is called when the entry code was able to - * switch off the IST to a safe kernel stack. - * - * With the current implementation it is always possible to switch to a safe - * stack because #VC exceptions only happen at known places, like intercepted - * instructions or accesses to MMIO areas/IO ports. They can also happen with - * code instrumentation when the hypervisor intercepts #DB, but the critical - * paths are forbidden to be instrumented, so #DB exceptions currently also - * only happen in safe places. - */ -DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) +static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) { - irqentry_state_t irq_state; struct ghcb_state state; struct es_em_ctxt ctxt; enum es_result result; struct ghcb *ghcb; - - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) { - vc_handle_trap_db(regs); - return; - } - - irq_state = irqentry_nmi_enter(regs); - instrumentation_begin(); - - /* - * This is invoked through an interrupt gate, so IRQs are disabled. The - * code below might walk page-tables for user or kernel addresses, so - * keep the IRQs disabled to protect us against concurrent TLB flushes. - */ + bool ret = true; ghcb = __sev_get_ghcb(&state); @@ -1388,15 +1352,18 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) case ES_UNSUPPORTED: pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", error_code, regs->ip); - goto fail; + ret = false; + break; case ES_VMM_ERROR: pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", error_code, regs->ip); - goto fail; + ret = false; + break; case ES_DECODE_FAILED: pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", error_code, regs->ip); - goto fail; + ret = false; + break; case ES_EXCEPTION: vc_forward_exception(&ctxt); break; @@ -1412,24 +1379,52 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication) BUG(); } -out: - instrumentation_end(); - irqentry_nmi_exit(regs, irq_state); + return ret; +} - return; +static __always_inline bool vc_is_db(unsigned long error_code) +{ + return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; +} -fail: - if (user_mode(regs)) { - /* - * Do not kill the machine if user-space triggered the - * exception. Send SIGBUS instead and let user-space deal with - * it. - */ - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); - } else { - pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n", - result); +/* + * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode + * and will panic when an error happens. + */ +DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) +{ + irqentry_state_t irq_state; + + /* + * With the current implementation it is always possible to switch to a + * safe stack because #VC exceptions only happen at known places, like + * intercepted instructions or accesses to MMIO areas/IO ports. They can + * also happen with code instrumentation when the hypervisor intercepts + * #DB, but the critical paths are forbidden to be instrumented, so #DB + * exceptions currently also only happen in safe places. + * + * But keep this here in case the noinstr annotations are violated due + * to bug elsewhere. + */ + if (unlikely(on_vc_fallback_stack(regs))) { + instrumentation_begin(); + panic("Can't handle #VC exception from unsupported context\n"); + instrumentation_end(); + } + + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + exc_debug(regs); + return; + } + + irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); + + if (!vc_raw_handle_exception(regs, error_code)) { /* Show some debug info */ show_regs(regs); @@ -1440,23 +1435,38 @@ fail: panic("Returned from Terminate-Request to Hypervisor\n"); } - goto out; + instrumentation_end(); + irqentry_nmi_exit(regs, irq_state); } -/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */ -DEFINE_IDTENTRY_VC_IST(exc_vmm_communication) +/* + * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode + * and will kill the current task with SIGBUS when an error happens. + */ +DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) { + /* + * Handle #DB before calling into !noinstr code to avoid recursive #DB. + */ + if (vc_is_db(error_code)) { + noist_exc_debug(regs); + return; + } + + irqentry_enter_from_user_mode(regs); instrumentation_begin(); - panic("Can't handle #VC exception from unsupported context\n"); - instrumentation_end(); -} -DEFINE_IDTENTRY_VC(exc_vmm_communication) -{ - if (likely(!on_vc_fallback_stack(regs))) - safe_stack_exc_vmm_communication(regs, error_code); - else - ist_exc_vmm_communication(regs, error_code); + if (!vc_raw_handle_exception(regs, error_code)) { + /* + * Do not kill the machine if user-space triggered the + * exception. Send SIGBUS instead and let user-space deal with + * it. + */ + force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); + } + + instrumentation_end(); + irqentry_exit_to_user_mode(regs); } bool __init handle_vc_boot_ghcb(struct pt_regs *regs) -- cgit v1.2.3 From 31197d3a0f1caeb60fb01f6755e28347e4f44037 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Jun 2021 16:13:55 +0200 Subject: objtool/x86: Ignore __x86_indirect_alt_* symbols Because the __x86_indirect_alt* symbols are just that, objtool will try and validate them as regular symbols, instead of the alternative replacements that they are. This goes sideways for FRAME_POINTER=y builds; which generate a fair amount of warnings. Fixes: 9bc0bb50727c ("objtool/x86: Rewrite retpoline thunk calls") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/YNCgxwLBiK9wclYJ@hirez.programming.kicks-ass.net --- arch/x86/lib/retpoline.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 4d32cb06ffd5..ec9922cba30a 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -58,12 +58,16 @@ SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg) 2: .skip 5-(2b-1b), 0x90 SYM_FUNC_END(__x86_indirect_alt_call_\reg) +STACK_FRAME_NON_STANDARD(__x86_indirect_alt_call_\reg) + SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg) ANNOTATE_RETPOLINE_SAFE 1: jmp *%\reg 2: .skip 5-(2b-1b), 0x90 SYM_FUNC_END(__x86_indirect_alt_jmp_\reg) +STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg) + .endm /* -- cgit v1.2.3 From 5140bc7d6bc8abad58b4f2a2c011607bfd922992 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 18 Jun 2021 16:59:41 -0700 Subject: KVM: VMX: Skip #PF(RSVD) intercepts when emulating smaller maxphyaddr As part of smaller maxphyaddr emulation, kvm needs to intercept present page faults to see if it needs to add the RSVD flag (bit 3) to the error code. However, there is no need to intercept page faults that already have the RSVD flag set. When setting up the page fault intercept, add the RSVD flag into the #PF error code mask field (but not the #PF error code match field) to skip the intercept when the RSVD flag is already set. Signed-off-by: Jim Mattson Message-Id: <20210618235941.1041604-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 889e83f71235..ab6f682645d7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -747,16 +747,21 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; else { - /* - * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched - * between guest and host. In that case we only care about present - * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in - * prepare_vmcs02_rare. - */ - bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR)); - int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0; + int mask = 0, match = 0; + + if (enable_ept && (eb & (1u << PF_VECTOR))) { + /* + * If EPT is enabled, #PF is currently only intercepted + * if MAXPHYADDR is smaller on the guest than on the + * host. In that case we only care about present, + * non-reserved faults. For vmcs02, however, PFEC_MASK + * and PFEC_MATCH are set in prepare_vmcs02_rare. + */ + mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; + match = PFERR_PRESENT_MASK; + } vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); } vmcs_write32(EXCEPTION_BITMAP, eb); -- cgit v1.2.3 From ba1f82456ba8438a8abc96274d57bfe76d34a4a8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 18 Jun 2021 14:46:58 -0700 Subject: KVM: nVMX: Dynamically compute max VMCS index for vmcs12 Calculate the max VMCS index for vmcs12 by walking the array to find the actual max index. Hardcoding the index is prone to bitrot, and the calculation is only done on KVM bringup (albeit on every CPU, but there aren't _that_ many null entries in the array). Fixes: 3c0f99366e34 ("KVM: nVMX: Add a TSC multiplier field in VMCS12") Signed-off-by: Sean Christopherson Message-Id: <20210618214658.2700765-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 37 +++++++++++++++++++++++++++++++++++-- arch/x86/kvm/vmx/vmcs.h | 8 ++++++++ arch/x86/kvm/vmx/vmcs12.h | 6 ------ 3 files changed, 43 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b531e08a095b..183fd9d62fc5 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6374,6 +6374,40 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void) } } +/* + * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo + * that madness to get the encoding for comparison. + */ +#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) + +static u64 nested_vmx_calc_vmcs_enum_msr(void) +{ + /* + * Note these are the so called "index" of the VMCS field encoding, not + * the index into vmcs12. + */ + unsigned int max_idx, idx; + int i; + + /* + * For better or worse, KVM allows VMREAD/VMWRITE to all fields in + * vmcs12, regardless of whether or not the associated feature is + * exposed to L1. Simply find the field with the highest index. + */ + max_idx = 0; + for (i = 0; i < nr_vmcs12_fields; i++) { + /* The vmcs12 table is very, very sparsely populated. */ + if (!vmcs_field_to_offset_table[i]) + continue; + + idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); + if (idx > max_idx) + max_idx = idx; + } + + return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; +} + /* * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be * returned for the various VMX controls MSRs when nested VMX is enabled. @@ -6619,8 +6653,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); - /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; + msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } void nested_vmx_hardware_unsetup(void) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 1472c6c376f7..de3b04d4b587 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -164,4 +164,12 @@ static inline int vmcs_field_readonly(unsigned long field) return (((field >> 10) & 0x3) == 1); } +#define VMCS_FIELD_INDEX_SHIFT (1) +#define VMCS_FIELD_INDEX_MASK GENMASK(9, 1) + +static inline unsigned int vmcs_field_index(unsigned long field) +{ + return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT; +} + #endif /* __KVM_X86_VMX_VMCS_H */ diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index bb81a23afe89..5e0e1b39f495 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -205,12 +205,6 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE -/* - * VMCS12_MAX_FIELD_INDEX is the highest index value used in any - * supported VMCS12 field encoding. - */ -#define VMCS12_MAX_FIELD_INDEX 0x17 - /* * For save/restore compatibility, the vmcs12 field offsets must not change. */ -- cgit v1.2.3 From 9301982c424a003c0095bf157154a85bf5322bd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jun 2021 16:18:24 +0200 Subject: x86/fpu: Preserve supervisor states in sanitize_restored_user_xstate() sanitize_restored_user_xstate() preserves the supervisor states only when the fx_only argument is zero, which allows unprivileged user space to put supervisor states back into init state. Preserve them unconditionally. [ bp: Fix a typo or two in the text. ] Fixes: 5d6b6a6f9b5c ("x86/fpu/xstate: Update sanitize_restored_xstate() for supervisor xstates") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210618143444.438635017@linutronix.de --- arch/x86/kernel/fpu/signal.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ec3ae3054792..b7b92cdf3add 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -221,28 +221,18 @@ sanitize_restored_user_xstate(union fpregs_state *state, if (use_xsave()) { /* - * Note: we don't need to zero the reserved bits in the - * xstate_header here because we either didn't copy them at all, - * or we checked earlier that they aren't set. + * Clear all feature bits which are not set in + * user_xfeatures and clear all extended features + * for fx_only mode. */ + u64 mask = fx_only ? XFEATURE_MASK_FPSSE : user_xfeatures; /* - * 'user_xfeatures' might have bits clear which are - * set in header->xfeatures. This represents features that - * were in init state prior to a signal delivery, and need - * to be reset back to the init state. Clear any user - * feature bits which are set in the kernel buffer to get - * them back to the init state. - * - * Supervisor state is unchanged by input from userspace. - * Ensure supervisor state bits stay set and supervisor - * state is not modified. + * Supervisor state has to be preserved. The sigframe + * restore can only modify user features, i.e. @mask + * cannot contain them. */ - if (fx_only) - header->xfeatures = XFEATURE_MASK_FPSSE; - else - header->xfeatures &= user_xfeatures | - xfeatures_mask_supervisor(); + header->xfeatures &= mask | xfeatures_mask_supervisor(); } if (use_fxsr()) { -- cgit v1.2.3 From f9dfb5e390fab2df9f7944bb91e7705aba14cd26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jun 2021 16:18:25 +0200 Subject: x86/fpu: Make init_fpstate correct with optimized XSAVE The XSAVE init code initializes all enabled and supported components with XRSTOR(S) to init state. Then it XSAVEs the state of the components back into init_fpstate which is used in several places to fill in the init state of components. This works correctly with XSAVE, but not with XSAVEOPT and XSAVES because those use the init optimization and skip writing state of components which are in init state. So init_fpstate.xsave still contains all zeroes after this operation. There are two ways to solve that: 1) Use XSAVE unconditionally, but that requires to reshuffle the buffer when XSAVES is enabled because XSAVES uses compacted format. 2) Save the components which are known to have a non-zero init state by other means. Looking deeper, #2 is the right thing to do because all components the kernel supports have all-zeroes init state except the legacy features (FP, SSE). Those cannot be hard coded because the states are not identical on all CPUs, but they can be saved with FXSAVE which avoids all conditionals. Use FXSAVE to save the legacy FP/SSE components in init_fpstate along with a BUILD_BUG_ON() which reminds developers to validate that a newly added component has all zeroes init state. As a bonus remove the now unused copy_xregs_to_kernel_booting() crutch. The XSAVE and reshuffle method can still be implemented in the unlikely case that components are added which have a non-zero init state and no other means to save them. For now, FXSAVE is just simple and good enough. [ bp: Fix a typo or two in the text. ] Fixes: 6bad06b76892 ("x86, xsave: Use xsaveopt in context-switch path when supported") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210618143444.587311343@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 30 ++++++++------------------- arch/x86/kernel/fpu/xstate.c | 41 ++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fdee23ea4e17..16bf4d4a8159 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -204,6 +204,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); } +static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" @@ -268,28 +276,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") -/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) -{ - u64 mask = xfeatures_mask_all; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - else - XSTATE_OP(XSAVE, xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d0eef963aad1..1cadb2faf740 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -440,6 +440,25 @@ static void __init print_xstate_offset_size(void) } } +/* + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. + */ +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_PASID) + /* * setup the xstate image representing the init state */ @@ -447,6 +466,10 @@ static void __init setup_init_fpu_buf(void) { static int on_boot_cpu __initdata = 1; + BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED) != + XFEATURES_INIT_FPSTATE_HANDLED); + WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -466,10 +489,22 @@ static void __init setup_init_fpu_buf(void) copy_kernel_to_xregs_booting(&init_fpstate.xsave); /* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. + * All components are now in init state. Read the state back so + * that init_fpstate contains all non-zero init state. This only + * works with XSAVE, but not with XSAVEOPT and XSAVES because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVES is available because XSAVES uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. */ - copy_xregs_to_kernel_booting(&init_fpstate.xsave); + fxsave(&init_fpstate.fxsave); } static int xfeature_uncompacted_offset(int xfeature_nr) -- cgit v1.2.3 From 240001d4e3041832e8a2654adc3ccf1683132b92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Jun 2021 13:12:34 +0200 Subject: x86/entry: Fix noinstr fail in __do_fast_syscall_32() Fix: vmlinux.o: warning: objtool: __do_fast_syscall_32()+0xf5: call to trace_hardirqs_off() leaves .noinstr.text section Fixes: 5d5675df792f ("x86/entry: Fix entry/exit mismatch on failed fast 32-bit syscalls") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210621120120.467898710@infradead.org --- arch/x86/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7b2542b13ebd..a6bf516257ec 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -130,8 +130,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* User code screwed up. */ regs->ax = -EFAULT; - instrumentation_end(); local_irq_disable(); + instrumentation_end(); irqentry_exit_to_user_mode(regs); return false; } -- cgit v1.2.3 From 84e60065df9ef03759115a7e48c04bbc0d292165 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Jun 2021 13:12:35 +0200 Subject: x86/xen: Fix noinstr fail in xen_pv_evtchn_do_upcall() Fix: vmlinux.o: warning: objtool: xen_pv_evtchn_do_upcall()+0x23: call to irq_enter_rcu() leaves .noinstr.text section Fixes: 359f01d1816f ("x86/entry: Use run_sysvec_on_irqstack_cond() for XEN upcall") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210621120120.532960208@infradead.org --- arch/x86/entry/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a6bf516257ec..04bce95bc7e3 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -269,15 +269,16 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) irqentry_state_t state = irqentry_enter(regs); bool inhcall; + instrumentation_begin(); run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); inhcall = get_and_clear_inhcall(); if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { - instrumentation_begin(); irqentry_exit_cond_resched(); instrumentation_end(); restore_inhcall(inhcall); } else { + instrumentation_end(); irqentry_exit(regs, state); } } -- cgit v1.2.3 From 4c9c26f1e67648f41f28f8c997c5c9467a3dbbe4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Jun 2021 13:12:36 +0200 Subject: x86/xen: Fix noinstr fail in exc_xen_unknown_trap() Fix: vmlinux.o: warning: objtool: exc_xen_unknown_trap()+0x7: call to printk() leaves .noinstr.text section Fixes: 2e92493637a0 ("x86/xen: avoid warning in Xen pv guest with CONFIG_AMD_MEM_ENCRYPT enabled") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210621120120.606560778@infradead.org --- arch/x86/xen/enlighten_pv.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index e87699aa2dc8..03149422dce2 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -592,8 +592,10 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_debug) DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap) { /* This should never happen and there is no way to handle it. */ + instrumentation_begin(); pr_err("Unknown trap in Xen PV mode."); BUG(); + instrumentation_end(); } #ifdef CONFIG_X86_MCE -- cgit v1.2.3 From 1f008d46f1243899d27fd034ab5c41985bd16cee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Jun 2021 13:12:37 +0200 Subject: x86: Always inline task_size_max() Fix: vmlinux.o: warning: objtool: handle_bug()+0x10: call to task_size_max() leaves .noinstr.text section When #UD isn't a BUG, we shouldn't violate noinstr (we'll still probably die, but that's another story). Fixes: 025768a966a3 ("x86/cpu: Use alternative to generate the TASK_SIZE_MAX constant") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210621120120.682468274@infradead.org --- arch/x86/include/asm/page_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index ca840fec7776..4bde0dc66100 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -75,7 +75,7 @@ void copy_page(void *to, void *from); * * With page table isolation enabled, we map the LDT in ... [stay tuned] */ -static inline unsigned long task_size_max(void) +static __always_inline unsigned long task_size_max(void) { unsigned long ret; -- cgit v1.2.3 From 7560c02bdffb7c52d1457fa551b9e745d4b9e754 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 May 2021 12:01:20 -0700 Subject: clocksource: Check per-CPU clock synchronization when marked unstable Some sorts of per-CPU clock sources have a history of going out of synchronization with each other. However, this problem has purportedy been solved in the past ten years. Except that it is all too possible that the problem has instead simply been made less likely, which might mean that some of the occasional "Marking clocksource 'tsc' as unstable" messages might be due to desynchronization. How would anyone know? Therefore apply CPU-to-CPU synchronization checking to newly unstable clocksource that are marked with the new CLOCK_SOURCE_VERIFY_PERCPU flag. Lists of desynchronized CPUs are printed, with the caveat that if it is the reporting CPU that is itself desynchronized, it will appear that all the other clocks are wrong. Just like in real life. Reported-by: Chris Mason Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Acked-by: Feng Tang Link: https://lore.kernel.org/r/20210527190124.440372-2-paulmck@kernel.org --- arch/x86/kernel/tsc.c | 3 ++- include/linux/clocksource.h | 2 +- kernel/time/clocksource.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 57ec01192180..6eb1b097e97e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1152,7 +1152,8 @@ static struct clocksource clocksource_tsc = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_VALID_FOR_HRES | - CLOCK_SOURCE_MUST_VERIFY, + CLOCK_SOURCE_MUST_VERIFY | + CLOCK_SOURCE_VERIFY_PERCPU, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index d6ab416ee2d2..7f83d51c0fd7 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -137,7 +137,7 @@ struct clocksource { #define CLOCK_SOURCE_UNSTABLE 0x40 #define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 #define CLOCK_SOURCE_RESELECT 0x100 - +#define CLOCK_SOURCE_VERIFY_PERCPU 0x200 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 43243f2be98e..cb12225bf050 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -224,6 +224,60 @@ static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) return false; } +static u64 csnow_mid; +static cpumask_t cpus_ahead; +static cpumask_t cpus_behind; + +static void clocksource_verify_one_cpu(void *csin) +{ + struct clocksource *cs = (struct clocksource *)csin; + + csnow_mid = cs->read(cs); +} + +static void clocksource_verify_percpu(struct clocksource *cs) +{ + int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX; + u64 csnow_begin, csnow_end; + int cpu, testcpu; + s64 delta; + + cpumask_clear(&cpus_ahead); + cpumask_clear(&cpus_behind); + preempt_disable(); + testcpu = smp_processor_id(); + pr_warn("Checking clocksource %s synchronization from CPU %d.\n", cs->name, testcpu); + for_each_online_cpu(cpu) { + if (cpu == testcpu) + continue; + csnow_begin = cs->read(cs); + smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1); + csnow_end = cs->read(cs); + delta = (s64)((csnow_mid - csnow_begin) & cs->mask); + if (delta < 0) + cpumask_set_cpu(cpu, &cpus_behind); + delta = (csnow_end - csnow_mid) & cs->mask; + if (delta < 0) + cpumask_set_cpu(cpu, &cpus_ahead); + delta = clocksource_delta(csnow_end, csnow_begin, cs->mask); + cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + if (cs_nsec > cs_nsec_max) + cs_nsec_max = cs_nsec; + if (cs_nsec < cs_nsec_min) + cs_nsec_min = cs_nsec; + } + preempt_enable(); + if (!cpumask_empty(&cpus_ahead)) + pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", + cpumask_pr_args(&cpus_ahead), testcpu, cs->name); + if (!cpumask_empty(&cpus_behind)) + pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", + cpumask_pr_args(&cpus_behind), testcpu, cs->name); + if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) + pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", + testcpu, cs_nsec_min, cs_nsec_max, cs->name); +} + static void clocksource_watchdog(struct timer_list *unused) { u64 csnow, wdnow, cslast, wdlast, delta; @@ -448,6 +502,12 @@ static int __clocksource_watchdog_kthread(void) unsigned long flags; int select = 0; + /* Do any required per-CPU skew verification. */ + if (curr_clocksource && + curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE && + curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU) + clocksource_verify_percpu(curr_clocksource); + spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { -- cgit v1.2.3 From 2e27e793e280ff12cb5c202a1214c08b0d3a0f26 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 May 2021 12:01:22 -0700 Subject: clocksource: Reduce clocksource-skew threshold Currently, WATCHDOG_THRESHOLD is set to detect a 62.5-millisecond skew in a 500-millisecond WATCHDOG_INTERVAL. This requires that clocks be skewed by more than 12.5% in order to be marked unstable. Except that a clock that is skewed by that much is probably destroying unsuspecting software right and left. And given that there are now checks for false-positive skews due to delays between reading the two clocks, it should be possible to greatly decrease WATCHDOG_THRESHOLD, at least for fine-grained clocks such as TSC. Therefore, add a new uncertainty_margin field to the clocksource structure that contains the maximum uncertainty in nanoseconds for the corresponding clock. This field may be initialized manually, as it is for clocksource_tsc_early and clocksource_jiffies, which is copied to refined_jiffies. If the field is not initialized manually, it will be computed at clock-registry time as the period of the clock in question based on the scale and freq parameters to __clocksource_update_freq_scale() function. If either of those two parameters are zero, the tens-of-milliseconds WATCHDOG_THRESHOLD is used as a cowardly alternative to dividing by zero. No matter how the uncertainty_margin field is calculated, it is bounded below by twice WATCHDOG_MAX_SKEW, that is, by 100 microseconds. Note that manually initialized uncertainty_margin fields are not adjusted, but there is a WARN_ON_ONCE() that triggers if any such field is less than twice WATCHDOG_MAX_SKEW. This WARN_ON_ONCE() is intended to discourage production use of the one-nanosecond uncertainty_margin values that are used to test the clock-skew code itself. The actual clock-skew check uses the sum of the uncertainty_margin fields of the two clocksource structures being compared. Integer overflow is avoided because the largest computed value of the uncertainty_margin fields is one billion (10^9), and double that value fits into an unsigned int. However, if someone manually specifies (say) UINT_MAX, they will get what they deserve. Note that the refined_jiffies uncertainty_margin field is initialized to TICK_NSEC, which means that skew checks involving this clocksource will be sufficently forgiving. In a similar vein, the clocksource_tsc_early uncertainty_margin field is initialized to 32*NSEC_PER_MSEC, which replicates the current behavior and allows custom setting if needed in order to address the rare skews detected for this clocksource in current mainline. Suggested-by: Thomas Gleixner Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Acked-by: Feng Tang Link: https://lore.kernel.org/r/20210527190124.440372-4-paulmck@kernel.org --- arch/x86/kernel/tsc.c | 1 + include/linux/clocksource.h | 3 +++ kernel/time/clocksource.c | 48 +++++++++++++++++++++++++++++++++++---------- kernel/time/jiffies.c | 15 +++++++------- 4 files changed, 50 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6eb1b097e97e..2e076a459a0c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1128,6 +1128,7 @@ static int tsc_cs_enable(struct clocksource *cs) static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, + .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 7f83d51c0fd7..895203727cb5 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -43,6 +43,8 @@ struct module; * @shift: Cycle to nanosecond divisor (power of two) * @max_idle_ns: Maximum idle time permitted by the clocksource (nsecs) * @maxadj: Maximum adjustment value to mult (~11%) + * @uncertainty_margin: Maximum uncertainty in nanoseconds per half second. + * Zero says to use default WATCHDOG_THRESHOLD. * @archdata: Optional arch-specific data * @max_cycles: Maximum safe cycle value which won't overflow on * multiplication @@ -98,6 +100,7 @@ struct clocksource { u32 shift; u64 max_idle_ns; u32 maxadj; + u32 uncertainty_margin; #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA struct arch_clocksource_data archdata; #endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e4beab21a1fa..9b27888a6e75 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -95,6 +95,20 @@ static char override_name[CS_NAME_LEN]; static int finished_booting; static u64 suspend_start; +/* + * Threshold: 0.0312s, when doubled: 0.0625s. + * Also a default for cs->uncertainty_margin when registering clocks. + */ +#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) + +/* + * Maximum permissible delay between two readouts of the watchdog + * clocksource surrounding a read of the clocksource being validated. + * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as + * a lower bound for cs->uncertainty_margin values when registering clocks. + */ +#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC) + #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); static void clocksource_select(void); @@ -121,17 +135,9 @@ static int clocksource_watchdog_kthread(void *data); static void __clocksource_change_rating(struct clocksource *cs, int rating); /* - * Interval: 0.5sec Threshold: 0.0625s + * Interval: 0.5sec. */ #define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) - -/* - * Maximum permissible delay between two readouts of the watchdog - * clocksource surrounding a read of the clocksource being validated. - * This delay could be due to SMIs, NMIs, or to VCPU preemptions. - */ -#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) static void clocksource_watchdog_work(struct work_struct *work) { @@ -348,6 +354,7 @@ static void clocksource_watchdog(struct timer_list *unused) int next_cpu, reset_pending; int64_t wd_nsec, cs_nsec; struct clocksource *cs; + u32 md; spin_lock(&watchdog_lock); if (!watchdog_running) @@ -394,7 +401,8 @@ static void clocksource_watchdog(struct timer_list *unused) continue; /* Check the deviation from the watchdog clocksource. */ - if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + md = cs->uncertainty_margin + watchdog->uncertainty_margin; + if (abs(cs_nsec - wd_nsec) > md) { pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", smp_processor_id(), cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", @@ -1047,6 +1055,26 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); } + + /* + * If the uncertainty margin is not specified, calculate it. + * If both scale and freq are non-zero, calculate the clock + * period, but bound below at 2*WATCHDOG_MAX_SKEW. However, + * if either of scale or freq is zero, be very conservative and + * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the + * uncertainty margin. Allow stupidly small uncertainty margins + * to be specified by the caller for testing purposes, but warn + * to discourage production use of this capability. + */ + if (scale && freq && !cs->uncertainty_margin) { + cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); + if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW) + cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW; + } else if (!cs->uncertainty_margin) { + cs->uncertainty_margin = WATCHDOG_THRESHOLD; + } + WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW); + /* * Ensure clocksources that have large 'mult' values don't overflow * when adjusted. diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a492e4da69ba..01935aafdb46 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -49,13 +49,14 @@ static u64 jiffies_read(struct clocksource *cs) * for "tick-less" systems. */ static struct clocksource clocksource_jiffies = { - .name = "jiffies", - .rating = 1, /* lowest valid rating*/ - .read = jiffies_read, - .mask = CLOCKSOURCE_MASK(32), - .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ - .shift = JIFFIES_SHIFT, - .max_cycles = 10, + .name = "jiffies", + .rating = 1, /* lowest valid rating*/ + .uncertainty_margin = 32 * NSEC_PER_MSEC, + .read = jiffies_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .max_cycles = 10, }; __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); -- cgit v1.2.3 From 51c2ee6d121ceb31ab8d35aff4ce53007aefb455 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 21 Jun 2021 16:18:22 -0700 Subject: Kconfig: Introduce ARCH_WANTS_NO_INSTR and CC_HAS_NO_PROFILE_FN_ATTR We don't want compiler instrumentation to touch noinstr functions, which are annotated with the no_profile_instrument_function function attribute. Add a Kconfig test for this and make GCOV depend on it, and in the future, PGO. If an architecture is using noinstr, it should denote that via this Kconfig value. That makes Kconfigs that depend on noinstr able to express dependencies in an architecturally agnostic way. Cc: Masahiro Yamada Link: https://lore.kernel.org/lkml/YMTn9yjuemKFLbws@hirez.programming.kicks-ass.net/ Link: https://lore.kernel.org/lkml/YMcssV%2Fn5IBGv4f0@hirez.programming.kicks-ass.net/ Suggested-by: Nathan Chancellor Suggested-by: Peter Zijlstra Signed-off-by: Nick Desaulniers Reviewed-by: Peter Oberparleiter Reviewed-by: Nathan Chancellor Acked-by: Mark Rutland Acked-by: Heiko Carstens Acked-by: Peter Zijlstra (Intel) Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210621231822.2848305-4-ndesaulniers@google.com --- arch/Kconfig | 7 +++++++ arch/arm64/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/x86/Kconfig | 1 + init/Kconfig | 3 +++ kernel/gcov/Kconfig | 1 + 6 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index c45b770d3579..129df498a8e1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -285,6 +285,13 @@ config ARCH_THREAD_STACK_ALLOCATOR config ARCH_WANTS_DYNAMIC_TASK_STRUCT bool +config ARCH_WANTS_NO_INSTR + bool + help + An architecture should select this if the noinstr macro is being used on + functions to denote that the toolchain should avoid instrumenting such + functions and is required for correctness. + config ARCH_32BIT_OFF_T bool depends on !64BIT diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f1d8566bbf9..39bf982b06f8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -93,6 +93,7 @@ config ARM64 select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANTS_NO_INSTR select ARCH_HAS_UBSAN_SANITIZE_ALL select ARM_AMBA select ARM_ARCH_TIMER diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b4c7c34069f8..bd60310f33b9 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -117,6 +117,7 @@ config S390 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 12fa0d7bfa64..268a5dec8b3f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -113,6 +113,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_NO_INSTR select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 diff --git a/init/Kconfig b/init/Kconfig index 1ea12c64e4c9..31397a7a45fb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -83,6 +83,9 @@ config TOOLS_SUPPORT_RELR config CC_HAS_ASM_INLINE def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) +config CC_HAS_NO_PROFILE_FN_ATTR + def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) + config CONSTRUCTORS bool diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 58f87a3092f3..053447183ac5 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -5,6 +5,7 @@ config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS depends on !CC_IS_CLANG || CLANG_VERSION >= 110000 + depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR select CONSTRUCTORS default n help -- cgit v1.2.3 From 310f134ed41fcaa03eff302b1e69f1ce1ee21841 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 23 Jun 2021 08:40:00 +0200 Subject: x86/sev: Add defines for GHCB version 2 MSR protocol requests Add the necessary defines for supporting the GHCB version 2 protocol. This includes defines for: - MSR-based AP hlt request/response - Hypervisor Feature request/response This is the bare minimum of requests that need to be supported by a GHCB version 2 implementation. There are more requests in the specification, but those depend on Secure Nested Paging support being available. These defines are shared between SEV host and guest support. [ bp: Fold in https://lkml.kernel.org/r/20210622144825.27588-2-joro@8bytes.org too. Simplify the brewing macro maze into readability. ] Co-developed-by: Tom Lendacky Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/YNLXQIZ5e1wjkshG@8bytes.org --- arch/x86/include/asm/sev-common.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 629c3df243f0..2cef6c5a52c2 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -9,8 +9,13 @@ #define __ASM_X86_SEV_COMMON_H #define GHCB_MSR_INFO_POS 0 -#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1) +#define GHCB_DATA_LOW 12 +#define GHCB_MSR_INFO_MASK (BIT_ULL(GHCB_DATA_LOW) - 1) +#define GHCB_DATA(v) \ + (((unsigned long)(v) & ~GHCB_MSR_INFO_MASK) >> GHCB_DATA_LOW) + +/* SEV Information Request/Response */ #define GHCB_MSR_SEV_INFO_RESP 0x001 #define GHCB_MSR_SEV_INFO_REQ 0x002 #define GHCB_MSR_VER_MAX_POS 48 @@ -28,6 +33,7 @@ #define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK) #define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK) +/* CPUID Request/Response */ #define GHCB_MSR_CPUID_REQ 0x004 #define GHCB_MSR_CPUID_RESP 0x005 #define GHCB_MSR_CPUID_FUNC_POS 32 @@ -45,6 +51,14 @@ (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \ (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS)) +/* AP Reset Hold */ +#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 +#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 + +/* GHCB Hypervisor Feature Request/Response */ +#define GHCB_MSR_HV_FT_REQ 0x080 +#define GHCB_MSR_HV_FT_RESP 0x081 + #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf -- cgit v1.2.3 From 8d9d46bbf3b6b7ff8edcac33603ab45c29e0e07f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 22 Jun 2021 16:48:25 +0200 Subject: x86/sev: Use "SEV: " prefix for messages from sev.c The source file has been renamed froms sev-es.c to sev.c, but the messages are still prefixed with "SEV-ES: ". Change that to "SEV: " to make it consistent. Fixes: e759959fe3b8 ("x86/sev-es: Rename sev-es.{ch} to sev.{ch}") Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210622144825.27588-4-joro@8bytes.org --- arch/x86/kernel/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 87a4b00f028e..a6895e440bc3 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -7,7 +7,7 @@ * Author: Joerg Roedel */ -#define pr_fmt(fmt) "SEV-ES: " fmt +#define pr_fmt(fmt) "SEV: " fmt #include /* For show_regs() */ #include -- cgit v1.2.3 From 9625895011d130033d1bc7aac0d77a9bf68ff8a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:28 +0200 Subject: x86/fpu: Fix copy_xstate_to_kernel() gap handling The gap handling in copy_xstate_to_kernel() is wrong when XSAVES is in use. Using init_fpstate for copying the init state of features which are not set in the xstate header is only correct for the legacy area, but not for the extended features area because when XSAVES is in use then init_fpstate is in compacted form which means the xstate offsets which are used to copy from init_fpstate are not valid. Fortunately, this is not a real problem today because all extended features in use have an all-zeros init state, but it is wrong nevertheless and with a potentially dynamically sized init_fpstate this would result in an access outside of the init_fpstate. Fix this by keeping track of the last copied state in the target buffer and explicitly zero it when there is a feature or alignment gap. Use the compacted offset when accessing the extended feature space in init_fpstate. As this is not a functional issue on older kernels this is intentionally not tagged for stable. Fixes: b8be15d58806 ("x86/fpu/xstate: Re-enable XSAVES") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.294282032@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 105 +++++++++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f0f64d42204d..d822ce76f0a1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1084,20 +1084,10 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) return true; } -static void fill_gap(struct membuf *to, unsigned *last, unsigned offset) +static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, + void *init_xstate, unsigned int size) { - if (*last >= offset) - return; - membuf_write(to, (void *)&init_fpstate.xsave + *last, offset - *last); - *last = offset; -} - -static void copy_part(struct membuf *to, unsigned *last, unsigned offset, - unsigned size, void *from) -{ - fill_gap(to, last, offset); - membuf_write(to, from, size); - *last = offset + size; + membuf_write(to, from_xstate ? xstate : init_xstate, size); } /* @@ -1109,10 +1099,10 @@ static void copy_part(struct membuf *to, unsigned *last, unsigned offset, */ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) { + const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); + struct xregs_state *xinit = &init_fpstate.xsave; struct xstate_header header; - const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr); - unsigned size = to.left; - unsigned last = 0; + unsigned int zerofrom; int i; /* @@ -1122,41 +1112,68 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) header.xfeatures = xsave->header.xfeatures; header.xfeatures &= xfeatures_mask_user(); - if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(&to, &last, 0, off_mxcsr, &xsave->i387); - if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)) - copy_part(&to, &last, off_mxcsr, - MXCSR_AND_FLAGS_SIZE, &xsave->i387.mxcsr); - if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(&to, &last, offsetof(struct fxregs_state, st_space), - 128, &xsave->i387.st_space); - if (header.xfeatures & XFEATURE_MASK_SSE) - copy_part(&to, &last, xstate_offsets[XFEATURE_SSE], - 256, &xsave->i387.xmm_space); - /* - * Fill xsave->i387.sw_reserved value for ptrace frame: - */ - copy_part(&to, &last, offsetof(struct fxregs_state, sw_reserved), - 48, xstate_fx_sw_bytes); - /* - * Copy xregs_state->header: - */ - copy_part(&to, &last, offsetof(struct xregs_state, header), - sizeof(header), &header); + /* Copy FP state up to MXCSR */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, + &xinit->i387, off_mxcsr); + + /* Copy MXCSR when SSE or YMM are set in the feature mask */ + copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), + &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, + MXCSR_AND_FLAGS_SIZE); + + /* Copy the remaining FP state */ + copy_feature(header.xfeatures & XFEATURE_MASK_FP, + &to, &xsave->i387.st_space, &xinit->i387.st_space, + sizeof(xsave->i387.st_space)); + + /* Copy the SSE state - shared with YMM, but independently managed */ + copy_feature(header.xfeatures & XFEATURE_MASK_SSE, + &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, + sizeof(xsave->i387.xmm_space)); + + /* Zero the padding area */ + membuf_zero(&to, sizeof(xsave->i387.padding)); + + /* Copy xsave->i387.sw_reserved */ + membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); + + /* Copy the user space relevant state of @xsave->header */ + membuf_write(&to, &header, sizeof(header)); + + zerofrom = offsetof(struct xregs_state, extended_state_area); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { /* - * Copy only in-use xstates: + * The ptrace buffer is in non-compacted XSAVE format. + * In non-compacted format disabled features still occupy + * state space, but there is no state to copy from in the + * compacted init_fpstate. The gap tracking will zero this + * later. */ - if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, i); + if (!(xfeatures_mask_user() & BIT_ULL(i))) + continue; - copy_part(&to, &last, xstate_offsets[i], - xstate_sizes[i], src); - } + /* + * If there was a feature or alignment gap, zero the space + * in the destination buffer. + */ + if (zerofrom < xstate_offsets[i]) + membuf_zero(&to, xstate_offsets[i] - zerofrom); + + copy_feature(header.xfeatures & BIT_ULL(i), &to, + __raw_xsave_addr(xsave, i), + __raw_xsave_addr(xinit, i), + xstate_sizes[i]); + /* + * Keep track of the last copied state in the non-compacted + * target buffer for gap zeroing. + */ + zerofrom = xstate_offsets[i] + xstate_sizes[i]; } - fill_gap(&to, &last, size); + + if (to.left) + membuf_zero(&to, to.left); } /* -- cgit v1.2.3 From b3607269ff57fd3c9690cb25962c5e4b91a0fd3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:29 +0200 Subject: x86/pkeys: Revert a5eff7259790 ("x86/pkeys: Add PKRU value to init_fpstate") This cannot work and it's unclear how that ever made a difference. init_fpstate.xsave.header.xfeatures is always 0 so get_xsave_addr() will always return a NULL pointer, which will prevent storing the default PKRU value in init_fpstate. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.451391598@linutronix.de --- arch/x86/kernel/cpu/common.c | 5 ----- arch/x86/mm/pkeys.c | 6 ------ 2 files changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9173dd475818..dcbb11fda5d2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -466,8 +466,6 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { - struct pkru_state *pk; - /* check the boot processor, plus compile options for PKU: */ if (!cpu_feature_enabled(X86_FEATURE_PKU)) return; @@ -478,9 +476,6 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) return; cr4_set_bits(X86_CR4_PKE); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (pk) - pk->pkru = init_pkru_value; /* * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index a2332eef66e9..a840ac78b6de 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -10,7 +10,6 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ -#include /* init_fpstate */ int __execute_only_pkey(struct mm_struct *mm) { @@ -154,7 +153,6 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, static ssize_t init_pkru_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - struct pkru_state *pk; char buf[32]; ssize_t len; u32 new_init_pkru; @@ -177,10 +175,6 @@ static ssize_t init_pkru_write_file(struct file *file, return -EINVAL; WRITE_ONCE(init_pkru_value, new_init_pkru); - pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); - if (!pk) - return -EINVAL; - pk->pkru = new_init_pkru; return count; } -- cgit v1.2.3 From ce578f16348b003675c928a1992498b33b515f18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:30 +0200 Subject: x86/fpu: Mark various FPU state variables __ro_after_init Nothing modifies these after booting. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Reviewed-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20210623121451.611751529@linutronix.de --- arch/x86/kernel/fpu/init.c | 4 ++-- arch/x86/kernel/fpu/xstate.c | 14 +++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 701f196d7c68..95aa1090ea72 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -89,7 +89,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) /* * Boot time FPU feature detection code: */ -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) @@ -135,7 +135,7 @@ static void __init fpu__init_system_generic(void) * This is inherent to the XSAVE architecture which puts all state * components into a single, continuous memory block: */ -unsigned int fpu_kernel_xstate_size; +unsigned int fpu_kernel_xstate_size __ro_after_init; EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); /* Get alignment of the TYPE. */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d822ce76f0a1..bcb9f5648cfd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -61,17 +61,21 @@ static short xsave_cpuid_features[] __initdata = { */ u64 xfeatures_mask_all __read_mostly; -static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; * it is always in standard format for user mode. This is the user * mode standard format size used for signal and ptrace frames. */ -unsigned int fpu_user_xstate_size; +unsigned int fpu_user_xstate_size __ro_after_init; /* * Return whether the system supports a given xfeature. -- cgit v1.2.3 From 4e8e4313cf81add679e1c57677d689c02e382a67 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:31 +0200 Subject: x86/fpu: Make xfeatures_mask_all __ro_after_init Nothing has to modify this after init. But of course there is code which unconditionally masks xfeatures_mask_all on CPU hotplug. This goes unnoticed during boot hotplug because at that point the variable is still RW mapped. This is broken in several ways: 1) Masking this in post init CPU hotplug means that any modification of this state goes unnoticed until actual hotplug happens. 2) If that ever happens then these bogus feature bits are already populated all over the place and the system is in inconsistent state vs. the compacted XSTATE offsets. If at all then this has to panic the machine because the inconsistency cannot be undone anymore. Make this a one-time paranoia check in xstate init code and disable xsave when this happens. Reported-by: Kan Liang Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.712803952@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index bcb9f5648cfd..a64f61ae8540 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -59,7 +59,7 @@ static short xsave_cpuid_features[] __initdata = { * This represents the full set of bits that should ever be set in a kernel * XSAVE buffer, both supervisor and user xstates. */ -u64 xfeatures_mask_all __read_mostly; +u64 xfeatures_mask_all __ro_after_init; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; @@ -213,19 +213,8 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - u64 unsup_bits; - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) return; - /* - * Unsupported supervisor xstates should not be found in - * the xfeatures mask. - */ - unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; - WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n", - unsup_bits); - - xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); @@ -825,6 +814,7 @@ void __init fpu__init_system_xstate(void) { unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; + u64 xfeatures; int err; int i; @@ -879,6 +869,8 @@ void __init fpu__init_system_xstate(void) } xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); + /* Store it for paranoia check at the end */ + xfeatures = xfeatures_mask_all; /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -896,8 +888,18 @@ void __init fpu__init_system_xstate(void) setup_init_fpu_buf(); setup_xstate_comp_offsets(); setup_supervisor_only_offsets(); - print_xstate_offset_size(); + /* + * Paranoia check whether something in the setup modified the + * xfeatures mask. + */ + if (xfeatures != xfeatures_mask_all) { + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + xfeatures, xfeatures_mask_all); + goto out_disable; + } + + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask_all, fpu_kernel_xstate_size, -- cgit v1.2.3 From ce38f038ede735fd425ebda10d1758420a669a87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:32 +0200 Subject: x86/fpu: Get rid of fpu__get_supported_xfeatures_mask() This function is really not doing what the comment advertises: "Find supported xfeatures based on cpu features and command-line input. This must be called after fpu__init_parse_early_param() is called and xfeatures_mask is enumerated." fpu__init_parse_early_param() does not exist anymore and the function just returns a constant. Remove it and fix the caller and get rid of further references to fpu__init_parse_early_param(). Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.816404717@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/cpu/common.c | 5 ++--- arch/x86/kernel/fpu/init.c | 11 ----------- arch/x86/kernel/fpu/xstate.c | 4 +++- 4 files changed, 5 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 16bf4d4a8159..58f20821662f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -45,7 +45,6 @@ extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); -extern u64 fpu__get_supported_xfeatures_mask(void); /* * Debugging facility: diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dcbb11fda5d2..f875dea49d7e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1715,9 +1715,8 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in fpu__init_parse_early_param. - * But we need to keep a dummy __setup around otherwise it would - * show up as an environment variable for init. + * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy + * function prevents it from becoming an environment variable for init. */ static __init int setup_clearcpuid(char *arg) { diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 95aa1090ea72..64e29927cc32 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -216,17 +216,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_user_xstate_size = fpu_kernel_xstate_size; } -/* - * Find supported xfeatures based on cpu features and command-line input. - * This must be called after fpu__init_parse_early_param() is called and - * xfeatures_mask is enumerated. - */ -u64 __init fpu__get_supported_xfeatures_mask(void) -{ - return XFEATURE_MASK_USER_SUPPORTED | - XFEATURE_MASK_SUPERVISOR_SUPPORTED; -} - /* Legacy code to initialize eager fpu mode. */ static void __init fpu__init_system_ctx_switch(void) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a64f61ae8540..5e825ff0bfdc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -868,7 +868,9 @@ void __init fpu__init_system_xstate(void) xfeatures_mask_all &= ~BIT_ULL(i); } - xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); + xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; + /* Store it for paranoia check at the end */ xfeatures = xfeatures_mask_all; -- cgit v1.2.3 From 4098b3eef37be19572d270f9b761c3e8ffcf37ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:33 +0200 Subject: x86/fpu: Remove unused get_xsave_field_ptr() Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121451.915614415@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/xstate.c | 30 ------------------------------ 2 files changed, 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 47a92232d595..d22e973845c6 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -101,7 +101,6 @@ extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); -const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); int xfeature_size(int xfeature_nr); struct membuf; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 5e825ff0bfdc..b82879cd9151 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -998,36 +998,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) } EXPORT_SYMBOL_GPL(get_xsave_addr); -/* - * This wraps up the common operations that need to occur when retrieving - * data from xsave state. It first ensures that the current task was - * using the FPU and retrieves the data in to a buffer. It then calculates - * the offset of the requested field in the buffer. - * - * This function is safe to call whether the FPU is in use or not. - * - * Note that this only works on the current task. - * - * Inputs: - * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, - * XFEATURE_SSE, etc...) - * Output: - * address of the state in the xsave area or NULL if the state - * is not present or is in its 'init state'. - */ -const void *get_xsave_field_ptr(int xfeature_nr) -{ - struct fpu *fpu = ¤t->thread.fpu; - - /* - * fpu__save() takes the CPU's xstate registers - * and saves them off to the 'fpu memory buffer. - */ - fpu__save(fpu); - - return get_xsave_addr(&fpu->state.xsave, xfeature_nr); -} - #ifdef CONFIG_ARCH_HAS_PKEYS /* -- cgit v1.2.3 From e68524456c855e500f0a636adb1aa977e1e0b4d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:34 +0200 Subject: x86/fpu: Move inlines where they belong They are only used in fpstate_init() and there is no point to have them in a header just to make reading the code harder. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.023118522@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 14 -------------- arch/x86/kernel/fpu/core.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 58f20821662f..9ed984609595 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -86,20 +86,6 @@ extern void fpstate_init_soft(struct swregs_state *soft); static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -static inline void fpstate_init_xstate(struct xregs_state *xsave) -{ - /* - * XRSTORS requires these bits set in xcomp_bv, or it will - * trigger #GP: - */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; -} - -static inline void fpstate_init_fxstate(struct fxregs_state *fx) -{ - fx->cwd = 0x37f; - fx->mxcsr = MXCSR_DEFAULT; -} extern void fpstate_sanitize_xstate(struct fpu *fpu); #define user_insn(insn, output, input...) \ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 571220ac8bea..9ff467bb4894 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -181,6 +181,21 @@ void fpu__save(struct fpu *fpu) fpregs_unlock(); } +static inline void fpstate_init_xstate(struct xregs_state *xsave) +{ + /* + * XRSTORS requires these bits set in xcomp_bv, or it will + * trigger #GP: + */ + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; +} + +static inline void fpstate_init_fxstate(struct fxregs_state *fx) +{ + fx->cwd = 0x37f; + fx->mxcsr = MXCSR_DEFAULT; +} + /* * Legacy x87 fpstate state init: */ -- cgit v1.2.3 From 07d6688b22e09be465652cf2da0da6bf86154df6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:35 +0200 Subject: x86/fpu: Limit xstate copy size in xstateregs_set() If the count argument is larger than the xstate size, this will happily copy beyond the end of xstate. Fixes: 91c3dba7dbc1 ("x86/fpu/xstate: Fix PTRACE frames for XSAVES") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.120741557@linutronix.de --- arch/x86/kernel/fpu/regset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index c413756ba89f..6bb874441de8 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -117,7 +117,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, /* * A whole standard-format XSAVE buffer is needed: */ - if ((pos != 0) || (count < fpu_user_xstate_size)) + if (pos != 0 || count != fpu_user_xstate_size) return -EFAULT; xsave = &fpu->state.xsave; -- cgit v1.2.3 From 43be46e89698a41dbf4fff81a322f4c2ae21b5e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:36 +0200 Subject: x86/fpu: Sanitize xstateregs_set() xstateregs_set() operates on a stopped task and tries to copy the provided buffer into the task's fpu.state.xsave buffer. Any error while copying or invalid state detected after copying results in wiping the target task's FPU state completely including supervisor states. That's just wrong. The caller supplied invalid data or has a problem with unmapped memory, so there is absolutely no justification to corrupt the target state. Fix this with the following modifications: 1) If data has to be copied from userspace, allocate a buffer and copy from user first. 2) Use copy_kernel_to_xstate() unconditionally so that header checking works correctly. 3) Return on error without corrupting the target state. This prevents corrupting states and lets the caller deal with the problem it caused in the first place. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.214903673@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 4 ---- arch/x86/kernel/fpu/regset.c | 42 ++++++++++++++++----------------------- arch/x86/kernel/fpu/xstate.c | 14 +++++++------ 3 files changed, 25 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index d22e973845c6..1bb2d16f485b 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -111,8 +111,4 @@ void copy_supervisor_to_kernel(struct xregs_state *xsave); void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); - -/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_user_xstate_header(const struct xstate_header *hdr); - #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 6bb874441de8..a50c0a935499 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -2,11 +2,13 @@ /* * FPU register's regset abstraction, for ptrace, core dumps, etc. */ +#include +#include + #include #include #include #include -#include /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -108,10 +110,10 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; - struct xregs_state *xsave; + struct xregs_state *tmpbuf = NULL; int ret; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; /* @@ -120,32 +122,22 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (pos != 0 || count != fpu_user_xstate_size) return -EFAULT; - xsave = &fpu->state.xsave; - - fpu__prepare_write(fpu); + if (!kbuf) { + tmpbuf = vmalloc(count); + if (!tmpbuf) + return -ENOMEM; - if (using_compacted_format()) { - if (kbuf) - ret = copy_kernel_to_xstate(xsave, kbuf); - else - ret = copy_user_to_xstate(xsave, ubuf); - } else { - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); - if (!ret) - ret = validate_user_xstate_header(&xsave->header); + if (copy_from_user(tmpbuf, ubuf, count)) { + ret = -EFAULT; + goto out; + } } - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - /* - * In case of failure, mark all states as init: - */ - if (ret) - fpstate_init(&fpu->state); + fpu__prepare_write(fpu); + ret = copy_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); +out: + vfree(tmpbuf); return ret; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index b82879cd9151..2b7b579f06b6 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -543,7 +543,7 @@ int using_compacted_format(void) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_user_xstate_header(const struct xstate_header *hdr) +static int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & ~xfeatures_mask_user()) @@ -1155,7 +1155,7 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) } /* - * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format + * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] format * and copy to the target thread. This is called from xstateregs_set(). */ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) @@ -1202,14 +1202,16 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) */ xsave->header.xfeatures |= hdr.xfeatures; + /* mxcsr reserved bits must be masked to zero for historical reasons. */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + return 0; } /* - * Convert from a ptrace or sigreturn standard-format user-space buffer to - * kernel XSAVES format and copy to the target thread. This is called from - * xstateregs_set(), as well as potentially from the sigreturn() and - * rt_sigreturn() system calls. + * Convert from a sigreturn standard-format user-space buffer to kernel + * XSAVE[S] format and copy to the target thread. This is called from the + * sigreturn() and rt_sigreturn() system calls. */ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { -- cgit v1.2.3 From 947f4947cf00ea1e6d319eb182c64ea51ba4de8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:37 +0200 Subject: x86/fpu: Reject invalid MXCSR values in copy_kernel_to_xstate() Instead of masking out reserved bits, check them and reject the provided state as invalid if not zero. Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.308388343@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2b7b579f06b6..9cf84c5d1d37 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1154,6 +1154,19 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) membuf_zero(&to, to.left); } +static inline bool mxcsr_valid(struct xstate_header *hdr, const u32 *mxcsr) +{ + u64 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; + + /* Only check if it is in use */ + if (hdr->xfeatures & mask) { + /* Reserved bits in MXCSR must be zero. */ + if (*mxcsr & ~mxcsr_feature_mask) + return false; + } + return true; +} + /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] format * and copy to the target thread. This is called from xstateregs_set(). @@ -1172,6 +1185,9 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) if (validate_user_xstate_header(&hdr)) return -EINVAL; + if (!mxcsr_valid(&hdr, kbuf + offsetof(struct fxregs_state, mxcsr))) + return -EINVAL; + for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); @@ -1202,9 +1218,6 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) */ xsave->header.xfeatures |= hdr.xfeatures; - /* mxcsr reserved bits must be masked to zero for historical reasons. */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - return 0; } -- cgit v1.2.3 From 3a3351126ee8f1f1c86c4c79c60a650c1da89733 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:01:38 +0200 Subject: x86/fpu: Simplify PTRACE_GETREGS code ptrace() has interfaces that let a ptracer inspect a ptracee's register state. This includes XSAVE state. The ptrace() ABI includes a hardware-format XSAVE buffer for both the SETREGS and GETREGS interfaces. In the old days, the kernel buffer and the ptrace() ABI buffer were the same boring non-compacted format. But, since the advent of supervisor states and the compacted format, the kernel buffer has diverged from the format presented in the ABI. This leads to two paths in the kernel: 1. Effectively a verbatim copy_to_user() which just copies the kernel buffer out to userspace. This is used when the kernel buffer is kept in the non-compacted form which means that it shares a format with the ptrace ABI. 2. A one-state-at-a-time path: copy_xstate_to_kernel(). This is theoretically slower since it does a bunch of piecemeal copies. Remove the verbatim copy case. Speed probably does not matter in this path, and the vast majority of new hardware will use the one-state-at-a-time path anyway. This ensures greater testing for the "slow" path. This also makes enabling PKRU in this interface easier since a single path can be patched instead of two. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.408457100@linutronix.de --- arch/x86/kernel/fpu/regset.c | 24 +++--------------------- arch/x86/kernel/fpu/xstate.c | 6 +++--- 2 files changed, 6 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index a50c0a935499..d60e77d39222 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -77,32 +77,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct fpu *fpu = &target->thread.fpu; - struct xregs_state *xsave; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) + if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - xsave = &fpu->state.xsave; - fpu__prepare_read(fpu); - if (using_compacted_format()) { - copy_xstate_to_kernel(to, xsave); - return 0; - } else { - fpstate_sanitize_xstate(fpu); - /* - * Copy the 48 bytes defined by the software into the xsave - * area in the thread struct, so that we can copy the whole - * area to user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - - /* - * Copy the xstate memory layout. - */ - return membuf_write(&to, xsave, fpu_user_xstate_size); - } + copy_xstate_to_kernel(to, &fpu->state.xsave); + return 0; } int xstateregs_set(struct task_struct *target, const struct user_regset *regset, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9cf84c5d1d37..42032471c957 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1069,11 +1069,11 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, } /* - * Convert from kernel XSAVES compacted format to standard format and copy - * to a kernel-space ptrace buffer. + * Convert from kernel XSAVE or XSAVES compacted format to UABI + * non-compacted format and copy to a kernel-space ptrace buffer. * * It supports partial copy but pos always starts from zero. This is called - * from xstateregs_get() and there we check the CPU has XSAVES. + * from xstateregs_get() and there we check the CPU has XSAVE. */ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) { -- cgit v1.2.3 From 6164331d15f7d912fb9369245368e9564ea49813 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:01:39 +0200 Subject: x86/fpu: Rewrite xfpregs_set() xfpregs_set() was incomprehensible. Almost all of the complexity was due to trying to support nonsensically sized writes or -EFAULT errors that would have partially or completely overwritten the destination before failing. Nonsensically sized input would only have been possible using PTRACE_SETREGSET on REGSET_XFP. Fortunately, it appears (based on Debian code search results) that no one uses that API at all, let alone with the wrong sized buffer. Failed user access can be handled more cleanly by first copying to kernel memory. Just rewrite it to require sensible input. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.504234607@linutronix.de --- arch/x86/kernel/fpu/regset.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index d60e77d39222..f24ce873bfc2 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -47,30 +47,39 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, const void *kbuf, const void __user *ubuf) { struct fpu *fpu = &target->thread.fpu; + struct user32_fxsr_struct newstate; int ret; - if (!boot_cpu_has(X86_FEATURE_FXSR)) + BUILD_BUG_ON(sizeof(newstate) != sizeof(struct fxregs_state)); + + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; + /* No funny business with partial or oversized writes is permitted. */ + if (pos != 0 || count != sizeof(newstate)) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1); + if (ret) + return ret; + + /* Mask invalid MXCSR bits (for historical reasons). */ + newstate.mxcsr &= mxcsr_feature_mask; + fpu__prepare_write(fpu); - fpstate_sanitize_xstate(fpu); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); + /* Copy the state */ + memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); - /* - * mxcsr reserved bits must be masked to zero for security reasons. - */ - fpu->state.fxsave.mxcsr &= mxcsr_feature_mask; + /* Clear xmm8..15 */ + BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16); + memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16); - /* - * update the header bits in the xsave header, indicating the - * presence of FP and SSE state. - */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) + /* Mark FP and SSE as in use when XSAVE is enabled */ + if (use_xsave()) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; - return ret; + return 0; } int xstateregs_get(struct task_struct *target, const struct user_regset *regset, -- cgit v1.2.3 From 145e9e0d8c6fada4a40f9fc65b34658077874d9c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:01:40 +0200 Subject: x86/fpu: Fail ptrace() requests that try to set invalid MXCSR values There is no benefit from accepting and silently changing an invalid MXCSR value supplied via ptrace(). Instead, return -EINVAL on invalid input. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.613614842@linutronix.de --- arch/x86/kernel/fpu/regset.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index f24ce873bfc2..5610f77cacad 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -63,8 +63,9 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - /* Mask invalid MXCSR bits (for historical reasons). */ - newstate.mxcsr &= mxcsr_feature_mask; + /* Do not allow an invalid MXCSR value. */ + if (newstate.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; fpu__prepare_write(fpu); -- cgit v1.2.3 From da53f60bb86e60830932926cf1093953a811912c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:01:41 +0200 Subject: x86/fpu: Clean up fpregs_set() fpregs_set() has unnecessary complexity to support short or nonzero-offset writes and to handle the case in which a copy from userspace overwrites some of the target buffer and then fails. Support for partial writes is useless -- just require that the write has offset 0 and the correct size, and copy into a temporary kernel buffer to avoid clobbering the state if the user access fails. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.710467587@linutronix.de --- arch/x86/kernel/fpu/regset.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 5610f77cacad..7041b140de60 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -304,31 +304,32 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - fpu__prepare_write(fpu); - fpstate_sanitize_xstate(fpu); + /* No funny business with partial or oversized writes is permitted. */ + if (pos != 0 || count != sizeof(struct user_i387_ia32_struct)) + return -EINVAL; - if (!boot_cpu_has(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - if (!boot_cpu_has(X86_FEATURE_FXSR)) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (ret) + return ret; - if (pos > 0 || count < sizeof(env)) - convert_from_fxsr(&env, target); + fpu__prepare_write(fpu); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); - if (!ret) - convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); + if (cpu_feature_enabled(X86_FEATURE_FXSR)) + convert_to_fxsr(&fpu->state.fxsave, &env); + else + memcpy(&fpu->state.fsave, &env, sizeof(env)); /* - * update the header bit in the xsave header, indicating the + * Update the header bit in the xsave header, indicating the * presence of FP. */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP; - return ret; + + return 0; } #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ -- cgit v1.2.3 From eb6f51723f03c9a1c098ed196a31a03e626b9fb6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:42 +0200 Subject: x86/fpu: Make copy_xstate_to_kernel() usable for [x]fpregs_get() When xsave with init state optimization is used then a component's state in the task's xsave buffer can be stale when the corresponding feature bit is not set. fpregs_get() and xfpregs_get() invoke fpstate_sanitize_xstate() to update the task's xsave buffer before retrieving the FX or FP state. That's just duplicated code as copy_xstate_to_kernel() already handles this correctly. Add a copy mode argument to the function which allows to restrict the state copy to the FP and SSE features. Also rename the function to copy_xstate_to_uabi_buf() so the name reflects what it is doing. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.805327286@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 12 +++++++++-- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/xstate.c | 42 +++++++++++++++++++++++++++++---------- 3 files changed, 42 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 1bb2d16f485b..732ae793c2ab 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -103,12 +103,20 @@ extern void __init update_regset_xstate_info(unsigned int size, void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); int using_compacted_format(void); int xfeature_size(int xfeature_nr); -struct membuf; -void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_supervisor_to_kernel(struct xregs_state *xsave); void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); +enum xstate_copy_mode { + XSTATE_COPY_FP, + XSTATE_COPY_FX, + XSTATE_COPY_XSAVE, +}; + +struct membuf; +void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, + enum xstate_copy_mode mode); + #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 7041b140de60..783f84dfcd46 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -93,7 +93,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__prepare_read(fpu); - copy_xstate_to_kernel(to, &fpu->state.xsave); + copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_XSAVE); return 0; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 42032471c957..8d023a26ae22 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1068,14 +1068,20 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, membuf_write(to, from_xstate ? xstate : init_xstate, size); } -/* - * Convert from kernel XSAVE or XSAVES compacted format to UABI - * non-compacted format and copy to a kernel-space ptrace buffer. +/** + * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer + * @to: membuf descriptor + * @xsave: The kernel xstate buffer to copy from + * @copy_mode: The requested copy mode + * + * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming + * format, i.e. from the kernel internal hardware dependent storage format + * to the requested @mode. UABI XSTATE is always uncompacted! * - * It supports partial copy but pos always starts from zero. This is called - * from xstateregs_get() and there we check the CPU has XSAVE. + * It supports partial copy but @to.pos always starts from zero. */ -void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) +void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, + enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.xsave; @@ -1083,12 +1089,22 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) unsigned int zerofrom; int i; - /* - * The destination is a ptrace buffer; we put in only user xstates: - */ - memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= xfeatures_mask_user(); + + /* Mask out the feature bits depending on copy mode */ + switch (copy_mode) { + case XSTATE_COPY_FP: + header.xfeatures &= XFEATURE_MASK_FP; + break; + + case XSTATE_COPY_FX: + header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; + break; + + case XSTATE_COPY_XSAVE: + header.xfeatures &= xfeatures_mask_user(); + break; + } /* Copy FP state up to MXCSR */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, @@ -1109,6 +1125,9 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, sizeof(xsave->i387.xmm_space)); + if (copy_mode != XSTATE_COPY_XSAVE) + goto out; + /* Zero the padding area */ membuf_zero(&to, sizeof(xsave->i387.padding)); @@ -1150,6 +1169,7 @@ void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) zerofrom = xstate_offsets[i] + xstate_sizes[i]; } +out: if (to.left) membuf_zero(&to, to.left); } -- cgit v1.2.3 From adc997b3d66d1cfa8c15a7dbafdaef239a51b5db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:43 +0200 Subject: x86/fpu: Use copy_xstate_to_uabi_buf() in xfpregs_get() Use the new functionality of copy_xstate_to_uabi_buf() to retrieve the FX state when XSAVE* is in use. This avoids overwriting the FPU state buffer with fpstate_sanitize_xstate() which is error prone and duplicated code. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121452.901736860@linutronix.de --- arch/x86/kernel/fpu/regset.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 783f84dfcd46..ccbe25f6627d 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -33,13 +33,18 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; - if (!boot_cpu_has(X86_FEATURE_FXSR)) + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; fpu__prepare_read(fpu); - fpstate_sanitize_xstate(fpu); - return membuf_write(&to, &fpu->state.fxsave, sizeof(struct fxregs_state)); + if (!use_xsave()) { + return membuf_write(&to, &fpu->state.fxsave, + sizeof(fpu->state.fxsave)); + } + + copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_FX); + return 0; } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, -- cgit v1.2.3 From 3f7f75634ccefefcc929696f346db7a748e78f79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:44 +0200 Subject: x86/fpu: Use copy_xstate_to_uabi_buf() in fpregs_get() Use the new functionality of copy_xstate_to_uabi_buf() to retrieve the FX state when XSAVE* is in use. This avoids to overwrite the FPU state buffer with fpstate_sanitize_xstate() which is error prone and duplicated code. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.014441775@linutronix.de --- arch/x86/kernel/fpu/regset.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ccbe25f6627d..4b799e42ebc7 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -210,10 +210,10 @@ static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave) * FXSR floating point environment conversions. */ -void -convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +static void __convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk, + struct fxregs_state *fxsave) { - struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -247,6 +247,12 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave); +} + void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) @@ -279,25 +285,29 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, { struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; + struct fxregs_state fxsave, *fx; fpu__prepare_read(fpu); - if (!boot_cpu_has(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, to); - if (!boot_cpu_has(X86_FEATURE_FXSR)) { + if (!cpu_feature_enabled(X86_FEATURE_FXSR)) { return membuf_write(&to, &fpu->state.fsave, sizeof(struct fregs_state)); } - fpstate_sanitize_xstate(fpu); + if (use_xsave()) { + struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) }; - if (to.left == sizeof(env)) { - convert_from_fxsr(to.p, target); - return 0; + /* Handle init state optimized xstate correctly */ + copy_xstate_to_uabi_buf(mb, &fpu->state.xsave, XSTATE_COPY_FP); + fx = &fxsave; + } else { + fx = &fpu->state.fxsave; } - convert_from_fxsr(&env, target); + __convert_from_fxsr(&env, target, fx); return membuf_write(&to, &env, sizeof(env)); } -- cgit v1.2.3 From afac9e894364418731d1d7e66c1118b31fd130e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:45 +0200 Subject: x86/fpu: Remove fpstate_sanitize_xstate() No more users. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.124819167@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 2 - arch/x86/kernel/fpu/xstate.c | 79 ------------------------------------- 2 files changed, 81 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 9ed984609595..0148791acc49 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -86,8 +86,6 @@ extern void fpstate_init_soft(struct swregs_state *soft); static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif -extern void fpstate_sanitize_xstate(struct fpu *fpu); - #define user_insn(insn, output, input...) \ ({ \ int err; \ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8d023a26ae22..e17fde9b1ca3 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -128,85 +128,6 @@ static bool xfeature_is_supervisor(int xfeature_nr) return ecx & 1; } -/* - * When executing XSAVEOPT (or other optimized XSAVE instructions), if - * a processor implementation detects that an FPU state component is still - * (or is again) in its initialized state, it may clear the corresponding - * bit in the header.xfeatures field, and can skip the writeout of registers - * to the corresponding memory layout. - * - * This means that when the bit is zero, the state component might still contain - * some previous - non-initialized register state. - * - * Before writing xstate information to user-space we sanitize those components, - * to always ensure that the memory layout of a feature will be in the init state - * if the corresponding header bit is zero. This is to ensure that user-space doesn't - * see some stale state in the memory layout during signal handling, debugging etc. - */ -void fpstate_sanitize_xstate(struct fpu *fpu) -{ - struct fxregs_state *fx = &fpu->state.fxsave; - int feature_bit; - u64 xfeatures; - - if (!use_xsaveopt()) - return; - - xfeatures = fpu->state.xsave.header.xfeatures; - - /* - * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is up to date. - */ - if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all) - return; - - /* - * FP is in init state - */ - if (!(xfeatures & XFEATURE_MASK_FP)) { - fx->cwd = 0x37f; - fx->swd = 0; - fx->twd = 0; - fx->fop = 0; - fx->rip = 0; - fx->rdp = 0; - memset(fx->st_space, 0, sizeof(fx->st_space)); - } - - /* - * SSE is in init state - */ - if (!(xfeatures & XFEATURE_MASK_SSE)) - memset(fx->xmm_space, 0, sizeof(fx->xmm_space)); - - /* - * First two features are FPU and SSE, which above we handled - * in a special way already: - */ - feature_bit = 0x2; - xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2; - - /* - * Update all the remaining memory layouts according to their - * standard xstate layout, if their header bit is in the init - * state: - */ - while (xfeatures) { - if (xfeatures & 0x1) { - int offset = xstate_comp_offsets[feature_bit]; - int size = xstate_sizes[feature_bit]; - - memcpy((void *)fx + offset, - (void *)&init_fpstate.xsave + offset, - size); - } - - xfeatures >>= 1; - feature_bit++; - } -} - /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. -- cgit v1.2.3 From 5a32fac8dbe8adc08c10e2c8770c95aebfc627cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:46 +0200 Subject: x86/fpu/regset: Move fpu__read_begin() into regset The function can only be used from the regset get() callbacks safely. So there is no reason to have it globally exposed. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.234942936@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/core.c | 20 -------------------- arch/x86/kernel/fpu/regset.c | 22 +++++++++++++++++++--- 3 files changed, 19 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0148791acc49..4c669f3b54b4 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,7 +26,6 @@ /* * High level FPU state handling functions: */ -extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9ff467bb4894..d390644e5fa1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -281,26 +281,6 @@ static void fpu__initialize(struct fpu *fpu) trace_x86_fpu_init_state(fpu); } -/* - * This function must be called before we read a task's fpstate. - * - * There's two cases where this gets called: - * - * - for the current task (when coredumping), in which case we have - * to save the latest FPU registers into the fpstate, - * - * - or it's called for stopped tasks (ptrace), in which case the - * registers were already saved by the context-switch code when - * the task scheduled out. - * - * If the task has used the FPU before then save it. - */ -void fpu__prepare_read(struct fpu *fpu) -{ - if (fpu == ¤t->thread.fpu) - fpu__save(fpu); -} - /* * This function must be called before we write a task's fpstate. * diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 4b799e42ebc7..937adf702b37 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -28,6 +28,22 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r return 0; } +/* + * The regset get() functions are invoked from: + * + * - coredump to dump the current task's fpstate. If the current task + * owns the FPU then the memory state has to be synchronized and the + * FPU register state preserved. Otherwise fpstate is already in sync. + * + * - ptrace to dump fpstate of a stopped task, in which case the registers + * have already been saved to fpstate on context switch. + */ +static void sync_fpstate(struct fpu *fpu) +{ + if (fpu == ¤t->thread.fpu) + fpu__save(fpu); +} + int xfpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { @@ -36,7 +52,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; - fpu__prepare_read(fpu); + sync_fpstate(fpu); if (!use_xsave()) { return membuf_write(&to, &fpu->state.fxsave, @@ -96,7 +112,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__prepare_read(fpu); + sync_fpstate(fpu); copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_XSAVE); return 0; @@ -287,7 +303,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; struct fxregs_state fxsave, *fx; - fpu__prepare_read(fpu); + sync_fpstate(fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return fpregs_soft_get(target, regset, to); -- cgit v1.2.3 From dbb60ac764581e62f2116c5a6b8926ba3a872dd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:47 +0200 Subject: x86/fpu: Move fpu__write_begin() to regset The only usecase for fpu__write_begin is the set() callback of regset, so the function is pointlessly global. Move it to the regset code and rename it to fpu_force_restore() which is exactly decribing what the function does. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.328652975@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 1 - arch/x86/kernel/fpu/core.c | 24 ------------------------ arch/x86/kernel/fpu/regset.c | 25 ++++++++++++++++++++++--- 3 files changed, 22 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 4c669f3b54b4..854bfb03645e 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,7 +26,6 @@ /* * High level FPU state handling functions: */ -extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d390644e5fa1..22437011f4f0 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -281,30 +281,6 @@ static void fpu__initialize(struct fpu *fpu) trace_x86_fpu_init_state(fpu); } -/* - * This function must be called before we write a task's fpstate. - * - * Invalidate any cached FPU registers. - * - * After this function call, after registers in the fpstate are - * modified and the child task has woken up, the child task will - * restore the modified FPU state from the modified context. If we - * didn't clear its cached status here then the cached in-registers - * state pending on its former CPU could be restored, corrupting - * the modifications. - */ -void fpu__prepare_write(struct fpu *fpu) -{ - /* - * Only stopped child tasks can be used to modify the FPU - * state in the fpstate buffer: - */ - WARN_ON_FPU(fpu == ¤t->thread.fpu); - - /* Invalidate any cached state: */ - __fpu_invalidate_fpregs_state(fpu); -} - /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 937adf702b37..ddc290d9bf8e 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -44,6 +44,25 @@ static void sync_fpstate(struct fpu *fpu) fpu__save(fpu); } +/* + * Invalidate cached FPU registers before modifying the stopped target + * task's fpstate. + * + * This forces the target task on resume to restore the FPU registers from + * modified fpstate. Otherwise the task might skip the restore and operate + * with the cached FPU registers which discards the modifications. + */ +static void fpu_force_restore(struct fpu *fpu) +{ + /* + * Only stopped child tasks can be used to modify the FPU + * state in the fpstate buffer: + */ + WARN_ON_FPU(fpu == ¤t->thread.fpu); + + __fpu_invalidate_fpregs_state(fpu); +} + int xfpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { @@ -88,7 +107,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (newstate.mxcsr & ~mxcsr_feature_mask) return -EINVAL; - fpu__prepare_write(fpu); + fpu_force_restore(fpu); /* Copy the state */ memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate)); @@ -146,7 +165,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } } - fpu__prepare_write(fpu); + fpu_force_restore(fpu); ret = copy_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); out: @@ -346,7 +365,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - fpu__prepare_write(fpu); + fpu_force_restore(fpu); if (cpu_feature_enabled(X86_FEATURE_FXSR)) convert_to_fxsr(&fpu->state.fxsave, &env); -- cgit v1.2.3 From 02b93c0b00df222b9ccf7a1fbd0eb59353d0a58c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:48 +0200 Subject: x86/fpu: Get rid of using_compacted_format() This function is pointlessly global and a complete misnomer because it's usage is related to both supervisor state checks and compacted format checks. Remove it and just make the conditions check the XSAVES feature. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.425493349@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/xstate.c | 22 ++++------------------ 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 732ae793c2ab..6e5ba42ffd42 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -101,7 +101,6 @@ extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); -int using_compacted_format(void); int xfeature_size(int xfeature_nr); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e17fde9b1ca3..185cc5d1aa33 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -449,20 +449,6 @@ int xfeature_size(int xfeature_nr) return eax; } -/* - * 'XSAVES' implies two different things: - * 1. saving of supervisor/system state - * 2. using the compacted format - * - * Use this function when dealing with the compacted format so - * that it is obvious which aspect of 'XSAVES' is being handled - * by the calling code. - */ -int using_compacted_format(void) -{ - return boot_cpu_has(X86_FEATURE_XSAVES); -} - /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ static int validate_user_xstate_header(const struct xstate_header *hdr) { @@ -581,9 +567,9 @@ static void do_extra_xstate_size_checks(void) check_xstate_against_struct(i); /* * Supervisor state components can be managed only by - * XSAVES, which is compacted-format only. + * XSAVES. */ - if (!using_compacted_format()) + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_WARN_ON(xfeature_is_supervisor(i)); /* Align from the end of the previous feature */ @@ -593,9 +579,9 @@ static void do_extra_xstate_size_checks(void) * The offset of a given state in the non-compacted * format is given to us in a CPUID leaf. We check * them for being ordered (increasing offsets) in - * setup_xstate_features(). + * setup_xstate_features(). XSAVES uses compacted format. */ - if (!using_compacted_format()) + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) paranoid_xstate_size = xfeature_uncompacted_offset(i); /* * The compacted-format offset always depends on where -- cgit v1.2.3 From 71ef453355a9197fcfd8ff22391a4ad7861d79e6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:01:49 +0200 Subject: x86/kvm: Avoid looking up PKRU in XSAVE buffer PKRU is being removed from the kernel XSAVE/FPU buffers. This removal will probably include warnings for code that look up PKRU in those buffers. KVM currently looks up the location of PKRU but doesn't even use the pointer that it gets back. Rework the code to avoid calling get_xsave_addr() except in cases where its result is actually used. This makes the code more clear and also avoids the inevitable PKRU warnings. This is probably a good cleanup and could go upstream idependently of any PKRU rework. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.541037562@linutronix.de --- arch/x86/kvm/x86.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e0f4a46649d7..c25bf240eb26 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4604,20 +4604,21 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { + u32 size, offset, ecx, edx; u64 xfeature_mask = valid & -valid; int xfeature_nr = fls64(xfeature_mask) - 1; - void *src = get_xsave_addr(xsave, xfeature_nr); - - if (src) { - u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - if (xfeature_nr == XFEATURE_PKRU) - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - else - memcpy(dest + offset, src, size); + void *src; + + cpuid_count(XSTATE_CPUID, xfeature_nr, + &size, &offset, &ecx, &edx); + if (xfeature_nr == XFEATURE_PKRU) { + memcpy(dest + offset, &vcpu->arch.pkru, + sizeof(vcpu->arch.pkru)); + } else { + src = get_xsave_addr(xsave, xfeature_nr); + if (src) + memcpy(dest + offset, src, size); } valid -= xfeature_mask; @@ -4647,18 +4648,20 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { + u32 size, offset, ecx, edx; u64 xfeature_mask = valid & -valid; int xfeature_nr = fls64(xfeature_mask) - 1; - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) { - u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - if (xfeature_nr == XFEATURE_PKRU) - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - else + + cpuid_count(XSTATE_CPUID, xfeature_nr, + &size, &offset, &ecx, &edx); + + if (xfeature_nr == XFEATURE_PKRU) { + memcpy(&vcpu->arch.pkru, src + offset, + sizeof(vcpu->arch.pkru)); + } else { + void *dest = get_xsave_addr(xsave, xfeature_nr); + + if (dest) memcpy(dest, src + offset, size); } -- cgit v1.2.3 From 9fe8a6f5eed8fff6b2d7dbc99b911334e311732d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:50 +0200 Subject: x86/fpu: Cleanup arch_set_user_pkey_access() The function does a sanity check with a WARN_ON_ONCE() but happily proceeds when the pkey argument is out of range. Clean it up. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.635764326@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 185cc5d1aa33..579e343e6654 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -912,11 +912,10 @@ EXPORT_SYMBOL_GPL(get_xsave_addr); * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) + unsigned long init_val) { - u32 old_pkru; - int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); - u32 new_pkru_bits = 0; + u32 old_pkru, new_pkru_bits = 0; + int pkey_shift; /* * This check implies XSAVE support. OSPKE only gets @@ -930,7 +929,8 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * values originating from in-kernel users. Complain * if a bad value is observed. */ - WARN_ON_ONCE(pkey >= arch_max_pkey()); + if (WARN_ON_ONCE(pkey >= arch_max_pkey())) + return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) @@ -939,6 +939,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ + pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ -- cgit v1.2.3 From 1f3171252dc586745bb548d48f3bcedfea34b58d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:51 +0200 Subject: x86/fpu: Get rid of copy_supervisor_to_kernel() If the fast path of restoring the FPU state on sigreturn fails or is not taken and the current task's FPU is active then the FPU has to be deactivated for the slow path to allow a safe update of the tasks FPU memory state. With supervisor states enabled, this requires to save the supervisor state in the memory state first. Supervisor states require XSAVES so saving only the supervisor state requires to reshuffle the memory buffer because XSAVES uses the compacted format and therefore stores the supervisor states at the beginning of the memory state. That's just an overengineered optimization. Get rid of it and save the full state for this case. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.734561971@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 1 - arch/x86/kernel/fpu/signal.c | 13 +++++---- arch/x86/kernel/fpu/xstate.c | 55 --------------------------------------- 3 files changed, 8 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 6e5ba42ffd42..6611e069e834 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -104,7 +104,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); int xfeature_size(int xfeature_nr); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); -void copy_supervisor_to_kernel(struct xregs_state *xsave); void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 888c8e0d2ff1..501059579066 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -401,15 +401,18 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * the optimisation). */ fpregs_lock(); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - /* - * Supervisor states are not modified by user space input. Save - * current supervisor states first and invalidate the FPU regs. + * If supervisor states are available then save the + * hardware state in current's fpstate so that the + * supervisor state is preserved. Save the full state for + * simplicity. There is no point in optimizing this by only + * saving the supervisor states and then shuffle them to + * the right place in memory. This is the slow path and the + * above XRSTOR failed or ia32_fxstate is true. Shrug. */ if (xfeatures_mask_supervisor()) - copy_supervisor_to_kernel(&fpu->state.xsave); + copy_xregs_to_kernel(&fpu->state.xsave); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 579e343e6654..427977b52c6c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1204,61 +1204,6 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) return 0; } -/* - * Save only supervisor states to the kernel buffer. This blows away all - * old states, and is intended to be used only in __fpu__restore_sig(), where - * user states are restored from the user buffer. - */ -void copy_supervisor_to_kernel(struct xregs_state *xstate) -{ - struct xstate_header *header; - u64 max_bit, min_bit; - u32 lmask, hmask; - int err, i; - - if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES))) - return; - - if (!xfeatures_mask_supervisor()) - return; - - max_bit = __fls(xfeatures_mask_supervisor()); - min_bit = __ffs(xfeatures_mask_supervisor()); - - lmask = xfeatures_mask_supervisor(); - hmask = xfeatures_mask_supervisor() >> 32; - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - if (WARN_ON_FPU(err)) - return; - - /* - * At this point, the buffer has only supervisor states and must be - * converted back to normal kernel format. - */ - header = &xstate->header; - header->xcomp_bv |= xfeatures_mask_all; - - /* - * This only moves states up in the buffer. Start with - * the last state and move backwards so that states are - * not overwritten until after they are moved. Note: - * memmove() allows overlapping src/dst buffers. - */ - for (i = max_bit; i >= min_bit; i--) { - u8 *xbuf = (u8 *)xstate; - - if (!((header->xfeatures >> i) & 1)) - continue; - - /* Move xfeature 'i' into its normal location */ - memmove(xbuf + xstate_comp_offsets[i], - xbuf + xstate_supervisor_only_offsets[i], - xstate_sizes[i]); - } -} - /** * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to * an xsave area -- cgit v1.2.3 From b16313f71c1050ad5c92548925e0e9cec26989ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:52 +0200 Subject: x86/fpu: Rename copy_xregs_to_kernel() and copy_kernel_to_xregs() The function names for xsave[s]/xrstor[s] operations are horribly named and a permanent source of confusion. Rename: copy_xregs_to_kernel() to os_xsave() copy_kernel_to_xregs() to os_xrstor() These are truly low level wrappers around the actual instructions XSAVE[OPT]/XRSTOR and XSAVES/XRSTORS with the twist that the selection based on the available CPU features happens with an alternative to avoid conditionals all over the place and to provide the best performance for hot paths. The os_ prefix tells that this is the OS selected mechanism. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.830239347@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 17 +++++++++++------ arch/x86/kernel/fpu/core.c | 7 +++---- arch/x86/kernel/fpu/signal.c | 21 +++++++++++---------- arch/x86/kernel/fpu/xstate.c | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 854bfb03645e..dae15455914d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -261,7 +261,7 @@ static inline void fxsave(struct fxregs_state *fx) * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ -static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) +static inline void os_xrstor_booting(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; @@ -284,8 +284,11 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) /* * Save processor xstate to xsave area. + * + * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features + * and command line options. The choice is permanent until the next reboot. */ -static inline void copy_xregs_to_kernel(struct xregs_state *xstate) +static inline void os_xsave(struct xregs_state *xstate) { u64 mask = xfeatures_mask_all; u32 lmask = mask; @@ -302,8 +305,10 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) /* * Restore processor xstate from xsave area. + * + * Uses XRSTORS when XSAVES is used, XRSTOR otherwise. */ -static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) +static inline void os_xrstor(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; @@ -364,13 +369,13 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) * Restore xstate from kernel space xsave area, return an error code instead of * an exception. */ -static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) +static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; int err; - if (static_cpu_has(X86_FEATURE_XSAVES)) + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -383,7 +388,7 @@ extern int copy_fpregs_to_fpstate(struct fpu *fpu); static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { - copy_kernel_to_xregs(&fpstate->xsave, mask); + os_xrstor(&fpstate->xsave, mask); } else { if (use_fxsr()) copy_kernel_to_fxregs(&fpstate->fxsave); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 22437011f4f0..bfdcf7fd63e3 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -95,7 +95,7 @@ EXPORT_SYMBOL(irq_fpu_usable); int copy_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { - copy_xregs_to_kernel(&fpu->state.xsave); + os_xsave(&fpu->state.xsave); /* * AVX512 state is tracked here because its use is @@ -314,7 +314,7 @@ void fpu__drop(struct fpu *fpu) static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); + os_xrstor(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else @@ -345,8 +345,7 @@ static void fpu__clear(struct fpu *fpu, bool user_only) if (user_only) { if (!fpregs_state_valid(fpu, smp_processor_id()) && xfeatures_mask_supervisor()) - copy_kernel_to_xregs(&fpu->state.xsave, - xfeatures_mask_supervisor()); + os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); copy_init_fpstate_to_fpregs(xfeatures_mask_user()); } else { copy_init_fpstate_to_fpregs(xfeatures_mask_all); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 501059579066..33675b3dad0e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -261,14 +261,14 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) r = copy_user_to_fxregs(buf); if (!r) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.xsave, init_bv); return r; } else { init_bv = xfeatures_mask_user() & ~xbv; r = copy_user_to_xregs(buf, xbv); if (!r && unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.xsave, init_bv); return r; } } else if (use_fxsr()) { @@ -356,9 +356,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * has been copied to the kernel one. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && - xfeatures_mask_supervisor()) - copy_kernel_to_xregs(&fpu->state.xsave, - xfeatures_mask_supervisor()); + xfeatures_mask_supervisor()) { + os_xrstor(&fpu->state.xsave, + xfeatures_mask_supervisor()); + } fpregs_mark_activate(); fpregs_unlock(); return 0; @@ -412,7 +413,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * above XRSTOR failed or ia32_fxstate is true. Shrug. */ if (xfeatures_mask_supervisor()) - copy_xregs_to_kernel(&fpu->state.xsave); + os_xsave(&fpu->state.xsave); set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); @@ -430,14 +431,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_lock(); if (unlikely(init_bv)) - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.xsave, init_bv); /* * Restore previously saved supervisor xstates along with * copied-in user xstates. */ - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, - user_xfeatures | xfeatures_mask_supervisor()); + ret = os_xrstor_safe(&fpu->state.xsave, + user_xfeatures | xfeatures_mask_supervisor()); } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); @@ -454,7 +455,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) u64 init_bv; init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + os_xrstor(&init_fpstate.xsave, init_bv); } ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 427977b52c6c..4fca8a8edff8 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -400,7 +400,7 @@ static void __init setup_init_fpu_buf(void) /* * Init all the features state with header.xfeatures being 0x0 */ - copy_kernel_to_xregs_booting(&init_fpstate.xsave); + os_xrstor_booting(&init_fpstate.xsave); /* * All components are now in init state. Read the state back so -- cgit v1.2.3 From 6b862ba1821441e6083cf061404694d33a841526 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:53 +0200 Subject: x86/fpu: Rename copy_user_to_xregs() and copy_xregs_to_user() The function names for xsave[s]/xrstor[s] operations are horribly named and a permanent source of confusion. Rename: copy_xregs_to_user() to xsave_to_user_sigframe() copy_user_to_xregs() to xrstor_from_user_sigframe() so it's entirely clear what this is about. This is also a clear indicator of the potentially different storage format because this is user ABI and cannot use compacted format. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121453.924266705@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/signal.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index dae15455914d..a2ab74478d1d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -326,7 +326,7 @@ static inline void os_xrstor(struct xregs_state *xstate, u64 mask) * backward compatibility for old applications which don't understand * compacted format of xsave area. */ -static inline int copy_xregs_to_user(struct xregs_state __user *buf) +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) { u64 mask = xfeatures_mask_user(); u32 lmask = mask; @@ -351,7 +351,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) /* * Restore xstate from user space xsave area. */ -static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) +static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask) { struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 33675b3dad0e..4fe632f56697 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -129,7 +129,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) int err; if (use_xsave()) - err = copy_xregs_to_user(buf); + err = xsave_to_user_sigframe(buf); else if (use_fxsr()) err = copy_fxregs_to_user((struct fxregs_state __user *) buf); else @@ -266,7 +266,7 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) } else { init_bv = xfeatures_mask_user() & ~xbv; - r = copy_user_to_xregs(buf, xbv); + r = xrstor_from_user_sigframe(buf, xbv); if (!r && unlikely(init_bv)) os_xrstor(&init_fpstate.xsave, init_bv); return r; -- cgit v1.2.3 From 16dcf4385933a02bb21d0af86a04439d151ad42a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:54 +0200 Subject: x86/fpu: Rename fxregs-related copy functions The function names for fxsave/fxrstor operations are horribly named and a permanent source of confusion. Rename: copy_fxregs_to_kernel() to fxsave() copy_kernel_to_fxregs() to fxrstor() copy_fxregs_to_user() to fxsave_to_user_sigframe() copy_user_to_fxregs() to fxrstor_from_user_sigframe() so it's clear what these are doing. All these functions are really low level wrappers around the equally named instructions, so mapping to the documentation is just natural. While at it, replace the static_cpu_has(X86_FEATURE_FXSR) with use_fxsr() to be consistent with the rest of the code. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.017863494@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 18 +++++------------- arch/x86/kernel/fpu/core.c | 6 +++--- arch/x86/kernel/fpu/signal.c | 10 +++++----- 3 files changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a2ab74478d1d..7806f39de3cb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -129,7 +129,7 @@ static inline int copy_fregs_to_user(struct fregs_state __user *fx) return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } -static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) +static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); @@ -138,7 +138,7 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) } -static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) +static inline void fxrstor(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -146,7 +146,7 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) +static inline int fxrstor_safe(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -154,7 +154,7 @@ static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) +static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -177,14 +177,6 @@ static inline int copy_user_to_fregs(struct fregs_state __user *fx) return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline void copy_fxregs_to_kernel(struct fpu *fpu) -{ - if (IS_ENABLED(CONFIG_X86_32)) - asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); - else - asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); -} - static inline void fxsave(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) @@ -391,7 +383,7 @@ static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask os_xrstor(&fpstate->xsave, mask); } else { if (use_fxsr()) - copy_kernel_to_fxregs(&fpstate->fxsave); + fxrstor(&fpstate->fxsave); else copy_kernel_to_fregs(&fpstate->fsave); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index bfdcf7fd63e3..035487dcb29e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -107,7 +107,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu) } if (likely(use_fxsr())) { - copy_fxregs_to_kernel(fpu); + fxsave(&fpu->state.fxsave); return 1; } @@ -315,8 +315,8 @@ static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate.xsave, features_mask); - else if (static_cpu_has(X86_FEATURE_FXSR)) - copy_kernel_to_fxregs(&init_fpstate.fxsave); + else if (use_fxsr()) + fxrstor(&init_fpstate.fxsave); else copy_kernel_to_fregs(&init_fpstate.fsave); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 4fe632f56697..05f8445eb676 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -64,7 +64,7 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - copy_fxregs_to_kernel(&tsk->thread.fpu); + fxsave(&tsk->thread.fpu.state.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -131,7 +131,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) if (use_xsave()) err = xsave_to_user_sigframe(buf); else if (use_fxsr()) - err = copy_fxregs_to_user((struct fxregs_state __user *) buf); + err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else err = copy_fregs_to_user((struct fregs_state __user *) buf); @@ -259,7 +259,7 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) if (fx_only) { init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; - r = copy_user_to_fxregs(buf); + r = fxrstor_from_user_sigframe(buf); if (!r) os_xrstor(&init_fpstate.xsave, init_bv); return r; @@ -272,7 +272,7 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) return r; } } else if (use_fxsr()) { - return copy_user_to_fxregs(buf); + return fxrstor_from_user_sigframe(buf); } else return copy_user_to_fregs(buf); } @@ -458,7 +458,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) os_xrstor(&init_fpstate.xsave, init_bv); } - ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); + ret = fxrstor_safe(&fpu->state.fxsave); } else { ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); if (ret) -- cgit v1.2.3 From 872c65dbf669b3b471b3d8656391a6b4f736d22b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:55 +0200 Subject: x86/math-emu: Rename frstor() This is in the way of renaming the low level hardware accessors to match the instruction name. Prepend it with FPU_ which is consistent vs. the rest of the emulation code. No functional change. [ bp: Correct the Reported-by: ] Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.111665161@linutronix.de --- arch/x86/math-emu/fpu_proto.h | 2 +- arch/x86/math-emu/load_store.c | 2 +- arch/x86/math-emu/reg_ld_str.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 70d35c200945..94c4023092f3 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -144,7 +144,7 @@ extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d); extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d); extern int FPU_round_to_int(FPU_REG *r, u_char tag); extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s); -extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address); +extern void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address); extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d); extern void fsave(fpu_addr_modes addr_modes, u_char __user *data_address); extern int FPU_tagof(FPU_REG *ptr); diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index f15263e158e8..4092df79de4f 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -240,7 +240,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, fix-up operations. */ return 1; case 022: /* frstor m94/108byte */ - frstor(addr_modes, (u_char __user *) data_address); + FPU_frstor(addr_modes, (u_char __user *) data_address); /* Ensure that the values just loaded are not changed by fix-up operations. */ return 1; diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index 7ca6417c0c8d..7e4521fbe7da 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -1117,7 +1117,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) return s; } -void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) +void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address) { int i, regnr; u_char __user *s = fldenv(addr_modes, data_address); -- cgit v1.2.3 From 6fdc908cb56123591baa4259400cfb0787582b11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:56 +0200 Subject: x86/fpu: Rename fregs-related copy functions The function names for fnsave/fnrstor operations are horribly named and a permanent source of confusion. Rename: copy_kernel_to_fregs() to frstor() copy_fregs_to_user() to fnsave_to_user_sigframe() copy_user_to_fregs() to frstor_from_user_sigframe() so it's clear what these are doing. All these functions are really low level wrappers around the equally named instructions, so mapping to the documentation is just natural. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.223594101@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 10 +++++----- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 7806f39de3cb..cd88233f1057 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -124,7 +124,7 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ : output : input) -static inline int copy_fregs_to_user(struct fregs_state __user *fx) +static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } @@ -162,17 +162,17 @@ static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline void copy_kernel_to_fregs(struct fregs_state *fx) +static inline void frstor(struct fregs_state *fx) { kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_kernel_to_fregs_err(struct fregs_state *fx) +static inline int frstor_safe(struct fregs_state *fx) { return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } -static inline int copy_user_to_fregs(struct fregs_state __user *fx) +static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } @@ -385,7 +385,7 @@ static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask if (use_fxsr()) fxrstor(&fpstate->fxsave); else - copy_kernel_to_fregs(&fpstate->fsave); + frstor(&fpstate->fsave); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 035487dcb29e..1d2587607a7f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -318,7 +318,7 @@ static inline void copy_init_fpstate_to_fpregs(u64 features_mask) else if (use_fxsr()) fxrstor(&init_fpstate.fxsave); else - copy_kernel_to_fregs(&init_fpstate.fsave); + frstor(&init_fpstate.fsave); if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 05f8445eb676..430c66dc2218 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -133,7 +133,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) else if (use_fxsr()) err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else - err = copy_fregs_to_user((struct fregs_state __user *) buf); + err = fnsave_to_user_sigframe((struct fregs_state __user *) buf); if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) err = -EFAULT; @@ -274,7 +274,7 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); } else - return copy_user_to_fregs(buf); + return frstor_from_user_sigframe(buf); } static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) @@ -465,7 +465,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) goto out; fpregs_lock(); - ret = copy_kernel_to_fregs_err(&fpu->state.fsave); + ret = frstor_safe(&fpu->state.fsave); } if (!ret) fpregs_mark_activate(); -- cgit v1.2.3 From 1cc34413ff3f18c30e5df89fefd95cc0f3b3292e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:57 +0200 Subject: x86/fpu: Rename xstate copy functions which are related to UABI Rename them to reflect that these functions deal with user space format XSAVE buffers. copy_kernel_to_xstate() -> copy_uabi_from_kernel_to_xstate() copy_user_to_xstate() -> copy_sigframe_from_user_to_xstate() Again a clear statement that these functions deal with user space ABI. Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.318485015@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 4 ++-- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 5 +++-- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 6611e069e834..00e1a2ac5239 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -102,8 +102,8 @@ extern void __init update_regset_xstate_info(unsigned int size, void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); int xfeature_size(int xfeature_nr); -int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); -int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); +int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); +int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index ddc290d9bf8e..892aec1dd822 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -166,7 +166,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } fpu_force_restore(fpu); - ret = copy_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); + ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf); out: vfree(tmpbuf); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 430c66dc2218..fd4b58d7b72e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -422,7 +422,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; - ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) goto out; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4fca8a8edff8..57674f8c2342 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1099,7 +1099,7 @@ static inline bool mxcsr_valid(struct xstate_header *hdr, const u32 *mxcsr) * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] format * and copy to the target thread. This is called from xstateregs_set(). */ -int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) { unsigned int offset, size; int i; @@ -1154,7 +1154,8 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ -int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) +int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, + const void __user *ubuf) { unsigned int offset, size; int i; -- cgit v1.2.3 From 522e92743b35351bda1b6a9136560f833a9c2490 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:58 +0200 Subject: x86/fpu: Deduplicate copy_uabi_from_user/kernel_to_xstate() copy_uabi_from_user_to_xstate() and copy_uabi_from_kernel_to_xstate() are almost identical except for the copy function. Unify them. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20210623121454.414215896@linutronix.de --- arch/x86/kernel/fpu/xstate.c | 137 +++++++++++++++---------------------------- 1 file changed, 47 insertions(+), 90 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 57674f8c2342..0eb42a1b11e1 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -953,23 +953,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ -/* - * Weird legacy quirk: SSE and YMM states store information in the - * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP - * area is marked as unused in the xfeatures header, we need to copy - * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use. - */ -static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) -{ - if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM))) - return false; - - if (xfeatures & XFEATURE_MASK_FP) - return false; - - return true; -} - static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, void *init_xstate, unsigned int size) { @@ -1082,39 +1065,53 @@ out: membuf_zero(&to, to.left); } -static inline bool mxcsr_valid(struct xstate_header *hdr, const u32 *mxcsr) +static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, + const void *kbuf, const void __user *ubuf) { - u64 mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; - - /* Only check if it is in use */ - if (hdr->xfeatures & mask) { - /* Reserved bits in MXCSR must be zero. */ - if (*mxcsr & ~mxcsr_feature_mask) - return false; + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; } - return true; + return 0; } -/* - * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] format - * and copy to the target thread. This is called from xstateregs_set(). - */ -int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) + +static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf, + const void __user *ubuf) { unsigned int offset, size; - int i; struct xstate_header hdr; + u64 mask; + int i; offset = offsetof(struct xregs_state, header); - size = sizeof(hdr); - - memcpy(&hdr, kbuf + offset, size); + if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) + return -EFAULT; if (validate_user_xstate_header(&hdr)) return -EINVAL; - if (!mxcsr_valid(&hdr, kbuf + offsetof(struct fxregs_state, mxcsr))) - return -EINVAL; + /* Validate MXCSR when any of the related features is in use */ + mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; + if (hdr.xfeatures & mask) { + u32 mxcsr[2]; + + offset = offsetof(struct fxregs_state, mxcsr); + if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) + return -EFAULT; + + /* Reserved bits in MXCSR must be zero. */ + if (mxcsr[0] & ~mxcsr_feature_mask) + return -EINVAL; + + /* SSE and YMM require MXCSR even when FP is not in use. */ + if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { + xsave->i387.mxcsr = mxcsr[0]; + xsave->i387.mxcsr_mask = mxcsr[1]; + } + } for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = ((u64)1 << i); @@ -1125,16 +1122,11 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) offset = xstate_offsets[i]; size = xstate_sizes[i]; - memcpy(dst, kbuf + offset, size); + if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) + return -EFAULT; } } - if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - memcpy(&xsave->i387.mxcsr, kbuf + offset, size); - } - /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': @@ -1149,6 +1141,16 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) return 0; } +/* + * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] + * format and copy to the target thread. This is called from + * xstateregs_set(). + */ +int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) +{ + return copy_uabi_to_xstate(xsave, kbuf, NULL); +} + /* * Convert from a sigreturn standard-format user-space buffer to kernel * XSAVE[S] format and copy to the target thread. This is called from the @@ -1157,52 +1159,7 @@ int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) { - unsigned int offset, size; - int i; - struct xstate_header hdr; - - offset = offsetof(struct xregs_state, header); - size = sizeof(hdr); - - if (copy_from_user(&hdr, ubuf + offset, size)) - return -EFAULT; - - if (validate_user_xstate_header(&hdr)) - return -EINVAL; - - for (i = 0; i < XFEATURE_MAX; i++) { - u64 mask = ((u64)1 << i); - - if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, i); - - offset = xstate_offsets[i]; - size = xstate_sizes[i]; - - if (copy_from_user(dst, ubuf + offset, size)) - return -EFAULT; - } - } - - if (xfeatures_mxcsr_quirk(hdr.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - if (copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size)) - return -EFAULT; - } - - /* - * The state that came in from userspace was user-state only. - * Mask all the user states out of 'xfeatures': - */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; - - /* - * Add back in the features that came in from userspace: - */ - xsave->header.xfeatures |= hdr.xfeatures; - - return 0; + return copy_uabi_to_xstate(xsave, NULL, ubuf); } /** -- cgit v1.2.3 From ebe7234b08a42d69bae94c4062a84777ea26ef99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:01:59 +0200 Subject: x86/fpu: Rename copy_fpregs_to_fpstate() to save_fpregs_to_fpstate() A copy is guaranteed to leave the source intact, which is not the case when FNSAVE is used as that reinitilizes the registers. Save does not make such guarantees and it matches what this is about, i.e. to save the state for a later restore. Rename it to save_fpregs_to_fpstate(). Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.508853062@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/kernel/fpu/core.c | 10 +++++----- arch/x86/kvm/x86.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index cd88233f1057..559dd3018fd4 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -375,7 +375,7 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -extern int copy_fpregs_to_fpstate(struct fpu *fpu); +extern int save_fpregs_to_fpstate(struct fpu *fpu); static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { @@ -507,7 +507,7 @@ static inline void __fpregs_load_activate(void) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { - if (!copy_fpregs_to_fpstate(old_fpu)) + if (!save_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else old_fpu->last_cpu = cpu; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1d2587607a7f..dcf4d6b58180 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -92,7 +92,7 @@ EXPORT_SYMBOL(irq_fpu_usable); * Modern FPU state can be kept in registers, if there are * no pending FP exceptions. */ -int copy_fpregs_to_fpstate(struct fpu *fpu) +int save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(&fpu->state.xsave); @@ -119,7 +119,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu) return 0; } -EXPORT_SYMBOL(copy_fpregs_to_fpstate); +EXPORT_SYMBOL(save_fpregs_to_fpstate); void kernel_fpu_begin_mask(unsigned int kfpu_mask) { @@ -137,7 +137,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) * Ignore return value -- we don't care if reg state * is clobbered. */ - copy_fpregs_to_fpstate(¤t->thread.fpu); + save_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); @@ -172,7 +172,7 @@ void fpu__save(struct fpu *fpu) trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - if (!copy_fpregs_to_fpstate(fpu)) { + if (!save_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } } @@ -255,7 +255,7 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) if (test_thread_flag(TIF_NEED_FPU_LOAD)) memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - else if (!copy_fpregs_to_fpstate(dst_fpu)) + else if (!save_fpregs_to_fpstate(dst_fpu)) copy_kernel_to_fpregs(&dst_fpu->state); fpregs_unlock(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c25bf240eb26..71bacb73c115 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9637,7 +9637,7 @@ static void kvm_save_current_fpu(struct fpu *fpu) memcpy(&fpu->state, ¤t->thread.fpu.state, fpu_kernel_xstate_size); else - copy_fpregs_to_fpstate(fpu); + save_fpregs_to_fpstate(fpu); } /* Swap (qemu) user FPU context for the guest FPU context. */ -- cgit v1.2.3 From 08ded2cd18a09749e67a14426aa7fd1b04ab1dc0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:00 +0200 Subject: x86/fpu: Get rid of the FNSAVE optimization The FNSAVE support requires conditionals in quite some call paths because FNSAVE reinitializes the FPU hardware. If the save has to preserve the FPU register state then the caller has to conditionally restore it from memory when FNSAVE is in use. This also requires a conditional in context switch because the restore avoidance optimization cannot work with FNSAVE. As this only affects 20+ years old CPUs there is really no reason to keep this optimization effective for FNSAVE. It's about time to not optimize for antiques anymore. Just unconditionally FRSTOR the save content to the registers and clean up the conditionals all over the place. Suggested-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.617369268@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 18 ++++++++----- arch/x86/kernel/fpu/core.c | 54 ++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 559dd3018fd4..3e0eedaacdac 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -83,6 +83,7 @@ extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif +extern void save_fpregs_to_fpstate(struct fpu *fpu); #define user_insn(insn, output, input...) \ ({ \ @@ -375,8 +376,6 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -extern int save_fpregs_to_fpstate(struct fpu *fpu); - static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { @@ -507,12 +506,17 @@ static inline void __fpregs_load_activate(void) static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { - if (!save_fpregs_to_fpstate(old_fpu)) - old_fpu->last_cpu = -1; - else - old_fpu->last_cpu = cpu; + save_fpregs_to_fpstate(old_fpu); + /* + * The save operation preserved register state, so the + * fpu_fpregs_owner_ctx is still @old_fpu. Store the + * current CPU number in @old_fpu, so the next return + * to user space can avoid the FPU register restore + * when is returns on the same CPU and still owns the + * context. + */ + old_fpu->last_cpu = cpu; - /* But leave fpu_fpregs_owner_ctx! */ trace_x86_fpu_regs_deactivated(old_fpu); } } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index dcf4d6b58180..c290ba27ffef 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -83,16 +83,20 @@ bool irq_fpu_usable(void) EXPORT_SYMBOL(irq_fpu_usable); /* - * These must be called with preempt disabled. Returns - * 'true' if the FPU state is still intact and we can - * keep registers active. + * Save the FPU register state in fpu->state. The register state is + * preserved. * - * The legacy FNSAVE instruction cleared all FPU state - * unconditionally, so registers are essentially destroyed. - * Modern FPU state can be kept in registers, if there are - * no pending FP exceptions. + * Must be called with fpregs_lock() held. + * + * The legacy FNSAVE instruction clears all FPU state unconditionally, so + * register state has to be reloaded. That might be a pointless exercise + * when the FPU is going to be used by another task right after that. But + * this only affects 20+ years old 32bit systems and avoids conditionals all + * over the place. + * + * FXSAVE and all XSAVE variants preserve the FPU register state. */ -int save_fpregs_to_fpstate(struct fpu *fpu) +void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(&fpu->state.xsave); @@ -103,21 +107,20 @@ int save_fpregs_to_fpstate(struct fpu *fpu) */ if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) fpu->avx512_timestamp = jiffies; - return 1; + return; } if (likely(use_fxsr())) { fxsave(&fpu->state.fxsave); - return 1; + return; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, - * so we have to mark them inactive: + * so we have to reload them from the memory state. */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - - return 0; + frstor(&fpu->state.fsave); } EXPORT_SYMBOL(save_fpregs_to_fpstate); @@ -133,10 +136,6 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) if (!(current->flags & PF_KTHREAD) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); - /* - * Ignore return value -- we don't care if reg state - * is clobbered. - */ save_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); @@ -171,11 +170,8 @@ void fpu__save(struct fpu *fpu) fpregs_lock(); trace_x86_fpu_before_save(fpu); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - if (!save_fpregs_to_fpstate(fpu)) { - copy_kernel_to_fpregs(&fpu->state); - } - } + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); @@ -244,20 +240,16 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src) memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* - * If the FPU registers are not current just memcpy() the state. - * Otherwise save current FPU registers directly into the child's FPU - * context, without any memory-to-memory copying. - * - * ( The function 'fails' in the FNSAVE case, which destroys - * register contents so we have to load them back. ) + * If the FPU registers are not owned by current just memcpy() the + * state. Otherwise save the FPU registers directly into the + * child's FPU context, without any memory-to-memory copying. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); - else if (!save_fpregs_to_fpstate(dst_fpu)) - copy_kernel_to_fpregs(&dst_fpu->state); - + else + save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); -- cgit v1.2.3 From 1c61fada304c125c3f8a2b8eb1896406e4098a05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:01 +0200 Subject: x86/fpu: Rename copy_kernel_to_fpregs() to restore_fpregs_from_fpstate() This is not a copy functionality. It restores the register state from the supplied kernel buffer. No functional changes. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.716058365@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 8 ++++---- arch/x86/kvm/x86.c | 4 ++-- arch/x86/mm/extable.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3e0eedaacdac..3558cd05c62f 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -376,7 +376,7 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) +static inline void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { os_xrstor(&fpstate->xsave, mask); @@ -388,7 +388,7 @@ static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask } } -static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) +static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is @@ -403,7 +403,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) : : [addr] "m" (fpstate)); } - __copy_kernel_to_fpregs(fpstate, -1); + __restore_fpregs_from_fpstate(fpstate, -1); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); @@ -474,7 +474,7 @@ static inline void __fpregs_load_activate(void) return; if (!fpregs_state_valid(fpu, cpu)) { - copy_kernel_to_fpregs(&fpu->state); + restore_fpregs_from_fpstate(&fpu->state); fpregs_activate(fpu); fpu->last_cpu = cpu; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 71bacb73c115..6f651b9d774c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9653,7 +9653,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) */ if (vcpu->arch.guest_fpu) /* PKRU is separately restored in kvm_x86_ops.run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, + __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); fpregs_mark_activate(); @@ -9674,7 +9674,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_fpu) kvm_save_current_fpu(vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); + restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); fpregs_unlock(); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 121921b2927c..2c5cccd77b8b 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -65,7 +65,7 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __copy_kernel_to_fpregs(&init_fpstate, -1); + __restore_fpregs_from_fpstate(&init_fpstate, -1); return true; } EXPORT_SYMBOL_GPL(ex_handler_fprestore); -- cgit v1.2.3 From b76411b1b568311bfd89d03acc587ffc1548c26f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:02 +0200 Subject: x86/fpu: Rename initstate copy functions Again this not a copy. It's restoring register state from kernel memory. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121454.816581630@linutronix.de --- arch/x86/kernel/fpu/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c290ba27ffef..4a59e0fbcfd8 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -303,7 +303,7 @@ void fpu__drop(struct fpu *fpu) * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ -static inline void copy_init_fpstate_to_fpregs(u64 features_mask) +static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate.xsave, features_mask); @@ -338,9 +338,9 @@ static void fpu__clear(struct fpu *fpu, bool user_only) if (!fpregs_state_valid(fpu, smp_processor_id()) && xfeatures_mask_supervisor()) os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); - copy_init_fpstate_to_fpregs(xfeatures_mask_user()); + restore_fpregs_from_init_fpstate(xfeatures_mask_user()); } else { - copy_init_fpstate_to_fpregs(xfeatures_mask_all); + restore_fpregs_from_init_fpstate(xfeatures_mask_all); } fpregs_mark_activate(); -- cgit v1.2.3 From 01707b66535872f7a0d87f66078fd018d1814be0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:02:03 +0200 Subject: x86/fpu: Rename "dynamic" XSTATEs to "independent" The salient feature of "dynamic" XSTATEs is that they are not part of the main task XSTATE buffer. The fact that they are dynamically allocated is irrelevant and will become quite confusing when user math XSTATEs start being dynamically allocated. Rename them to "independent" because they are independent of the main XSTATE code. This is just a search-and-replace with some whitespace updates to keep things aligned. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/1eecb0e4f3e07828ebe5d737ec77dc3b708fad2d.1623388344.git.luto@kernel.org Link: https://lkml.kernel.org/r/20210623121454.911450390@linutronix.de --- arch/x86/events/intel/lbr.c | 6 ++-- arch/x86/include/asm/fpu/xstate.h | 22 +++++++------- arch/x86/kernel/fpu/xstate.c | 62 +++++++++++++++++++-------------------- 3 files changed, 45 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4409d2cccfda..2fa2df3d4e9b 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -491,7 +491,7 @@ static void intel_pmu_arch_lbr_xrstors(void *ctx) { struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; - copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR); + copy_kernel_to_independent_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR); } static __always_inline bool lbr_is_reset_in_cstate(void *ctx) @@ -576,7 +576,7 @@ static void intel_pmu_arch_lbr_xsaves(void *ctx) { struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; - copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR); + copy_independent_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR); } static void __intel_pmu_lbr_save(void *ctx) @@ -992,7 +992,7 @@ static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) intel_pmu_store_lbr(cpuc, NULL); return; } - copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR); + copy_independent_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR); intel_pmu_store_lbr(cpuc, xsave->lbr.entries); } diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 00e1a2ac5239..a55bd5cabb59 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -42,21 +42,21 @@ * and its size may be huge. Saving/restoring such supervisor state components * at each context switch can cause high CPU and space overhead, which should * be avoided. Such supervisor state components should only be saved/restored - * on demand. The on-demand dynamic supervisor features are set in this mask. + * on demand. The on-demand supervisor features are set in this mask. * - * Unlike the existing supported supervisor features, a dynamic supervisor + * Unlike the existing supported supervisor features, an independent supervisor * feature does not allocate a buffer in task->fpu, and the corresponding * supervisor state component cannot be saved/restored at each context switch. * - * To support a dynamic supervisor feature, a developer should follow the + * To support an independent supervisor feature, a developer should follow the * dos and don'ts as below: * - Do dynamically allocate a buffer for the supervisor state component. * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the * state component to/from the buffer. - * - Don't set the bit corresponding to the dynamic supervisor feature in + * - Don't set the bit corresponding to the independent supervisor feature in * IA32_XSS at run time, since it has been set at boot time. */ -#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR) +#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR) /* * Unsupported supervisor features. When a supervisor feature in this mask is @@ -66,7 +66,7 @@ /* All supervisor states including supported and unsupported states. */ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ - XFEATURE_MASK_DYNAMIC | \ + XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 @@ -87,12 +87,12 @@ static inline u64 xfeatures_mask_user(void) return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } -static inline u64 xfeatures_mask_dynamic(void) +static inline u64 xfeatures_mask_independent(void) { if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR; + return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; - return XFEATURE_MASK_DYNAMIC; + return XFEATURE_MASK_INDEPENDENT; } extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; @@ -104,8 +104,8 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); int xfeature_size(int xfeature_nr); int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); -void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); -void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); +void copy_independent_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); +void copy_kernel_to_independent_supervisor(struct xregs_state *xstate, u64 mask); enum xstate_copy_mode { XSTATE_COPY_FP, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 0eb42a1b11e1..27246a83ecb9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -151,7 +151,7 @@ void fpu__init_cpu_xstate(void) */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | - xfeatures_mask_dynamic()); + xfeatures_mask_independent()); } } @@ -551,7 +551,7 @@ static void check_xstate_against_struct(int nr) * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. * - * Dynamic XSAVE features allocate their own buffers and are not + * Independent XSAVE features allocate their own buffers and are not * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ @@ -617,18 +617,18 @@ static unsigned int __init get_xsaves_size(void) } /* - * Get the total size of the enabled xstates without the dynamic supervisor + * Get the total size of the enabled xstates without the independent supervisor * features. */ -static unsigned int __init get_xsaves_size_no_dynamic(void) +static unsigned int __init get_xsaves_size_no_independent(void) { - u64 mask = xfeatures_mask_dynamic(); + u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) return get_xsaves_size(); - /* Disable dynamic features. */ + /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* @@ -637,7 +637,7 @@ static unsigned int __init get_xsaves_size_no_dynamic(void) */ size = get_xsaves_size(); - /* Re-enable dynamic features so XSAVES will work on them again. */ + /* Re-enable independent features so XSAVES will work on them again. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; @@ -680,7 +680,7 @@ static int __init init_xstate_size(void) xsave_size = get_xsave_size(); if (boot_cpu_has(X86_FEATURE_XSAVES)) - possible_xstate_size = get_xsaves_size_no_dynamic(); + possible_xstate_size = get_xsaves_size_no_independent(); else possible_xstate_size = xsave_size; @@ -837,7 +837,7 @@ void fpu__resume_cpu(void) */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | - xfeatures_mask_dynamic()); + xfeatures_mask_independent()); } } @@ -1163,34 +1163,34 @@ int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, } /** - * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to - * an xsave area + * copy_independent_supervisor_to_kernel() - Save independent supervisor states to + * an xsave area * @xstate: A pointer to an xsave area - * @mask: Represent the dynamic supervisor features saved into the xsave area + * @mask: Represent the independent supervisor features saved into the xsave area * - * Only the dynamic supervisor states sets in the mask are saved into the xsave - * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic - * supervisor feature). Besides the dynamic supervisor states, the legacy + * Only the independent supervisor states sets in the mask are saved into the xsave + * area (See the comment in XFEATURE_MASK_INDEPENDENT for the details of independent + * supervisor feature). Besides the independent supervisor states, the legacy * region and XSAVE header are also saved into the xsave area. The supervisor * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. * * The xsave area must be 64-bytes aligned. */ -void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +void copy_independent_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) { - u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u64 independent_mask = xfeatures_mask_independent() & mask; u32 lmask, hmask; int err; if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) return; - if (WARN_ON_FPU(!dynamic_mask)) + if (WARN_ON_FPU(!independent_mask)) return; - lmask = dynamic_mask; - hmask = dynamic_mask >> 32; + lmask = independent_mask; + hmask = independent_mask >> 32; XSTATE_OP(XSAVES, xstate, lmask, hmask, err); @@ -1199,34 +1199,34 @@ void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) } /** - * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from - * an xsave area + * copy_kernel_to_independent_supervisor() - Restore independent supervisor states from + * an xsave area * @xstate: A pointer to an xsave area - * @mask: Represent the dynamic supervisor features restored from the xsave area + * @mask: Represent the independent supervisor features restored from the xsave area * - * Only the dynamic supervisor states sets in the mask are restored from the - * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of - * dynamic supervisor feature). Besides the dynamic supervisor states, the + * Only the independent supervisor states sets in the mask are restored from the + * xsave area (See the comment in XFEATURE_MASK_INDEPENDENT for the details of + * independent supervisor feature). Besides the independent supervisor states, the * legacy region and XSAVE header are also restored from the xsave area. The * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. * * The xsave area must be 64-bytes aligned. */ -void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) +void copy_kernel_to_independent_supervisor(struct xregs_state *xstate, u64 mask) { - u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u64 independent_mask = xfeatures_mask_independent() & mask; u32 lmask, hmask; int err; if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) return; - if (WARN_ON_FPU(!dynamic_mask)) + if (WARN_ON_FPU(!independent_mask)) return; - lmask = dynamic_mask; - hmask = dynamic_mask >> 32; + lmask = independent_mask; + hmask = independent_mask >> 32; XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); -- cgit v1.2.3 From a75c52896b6d42d6600db4d4dd9f7e4bde9218db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:04 +0200 Subject: x86/fpu/xstate: Sanitize handling of independent features The copy functions for the independent features are horribly named and the supervisor and independent part is just overengineered. The point is that the supplied mask has either to be a subset of the independent features or a subset of the task->fpu.xstate managed features. Rewrite it so it checks for invalid overlaps of these areas in the caller supplied feature mask. Rename it so it follows the new naming convention for these operations. Mop up the function documentation. This allows to use that function for other purposes as well. Suggested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Tested-by: Kan Liang Link: https://lkml.kernel.org/r/20210623121455.004880675@linutronix.de --- arch/x86/events/intel/lbr.c | 6 +-- arch/x86/include/asm/fpu/xstate.h | 5 +- arch/x86/kernel/fpu/xstate.c | 98 +++++++++++++++++++-------------------- 3 files changed, 54 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 2fa2df3d4e9b..f338645071c8 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -491,7 +491,7 @@ static void intel_pmu_arch_lbr_xrstors(void *ctx) { struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; - copy_kernel_to_independent_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR); + xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR); } static __always_inline bool lbr_is_reset_in_cstate(void *ctx) @@ -576,7 +576,7 @@ static void intel_pmu_arch_lbr_xsaves(void *ctx) { struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; - copy_independent_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR); + xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR); } static void __intel_pmu_lbr_save(void *ctx) @@ -992,7 +992,7 @@ static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) intel_pmu_store_lbr(cpuc, NULL); return; } - copy_independent_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR); + xsaves(&xsave->xsave, XFEATURE_MASK_LBR); intel_pmu_store_lbr(cpuc, xsave->lbr.entries); } diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index a55bd5cabb59..7de2384628e2 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -104,8 +104,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); int xfeature_size(int xfeature_nr); int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); -void copy_independent_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); -void copy_kernel_to_independent_supervisor(struct xregs_state *xstate, u64 mask); + +void xsaves(struct xregs_state *xsave, u64 mask); +void xrstors(struct xregs_state *xsave, u64 mask); enum xstate_copy_mode { XSTATE_COPY_FP, diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 27246a83ecb9..735d44c3efb6 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1162,76 +1162,74 @@ int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, return copy_uabi_to_xstate(xsave, NULL, ubuf); } +static bool validate_xsaves_xrstors(u64 mask) +{ + u64 xchk; + + if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) + return false; + /* + * Validate that this is either a task->fpstate related component + * subset or an independent one. + */ + if (mask & xfeatures_mask_independent()) + xchk = ~xfeatures_mask_independent(); + else + xchk = ~xfeatures_mask_all; + + if (WARN_ON_ONCE(!mask || mask & xchk)) + return false; + + return true; +} + /** - * copy_independent_supervisor_to_kernel() - Save independent supervisor states to - * an xsave area - * @xstate: A pointer to an xsave area - * @mask: Represent the independent supervisor features saved into the xsave area + * xsaves - Save selected components to a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to save * - * Only the independent supervisor states sets in the mask are saved into the xsave - * area (See the comment in XFEATURE_MASK_INDEPENDENT for the details of independent - * supervisor feature). Besides the independent supervisor states, the legacy - * region and XSAVE header are also saved into the xsave area. The supervisor - * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and - * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. + * The @xstate buffer must be 64 byte aligned and correctly initialized as + * XSAVES does not write the full xstate header. Before first use the + * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer + * can #GP. * - * The xsave area must be 64-bytes aligned. + * The feature mask must either be a subset of the independent features or + * a subset of the task->fpstate related features. */ -void copy_independent_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +void xsaves(struct xregs_state *xstate, u64 mask) { - u64 independent_mask = xfeatures_mask_independent() & mask; - u32 lmask, hmask; int err; - if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + if (!validate_xsaves_xrstors(mask)) return; - if (WARN_ON_FPU(!independent_mask)) - return; - - lmask = independent_mask; - hmask = independent_mask >> 32; - - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - - /* Should never fault when copying to a kernel buffer */ - WARN_ON_FPU(err); + XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); } /** - * copy_kernel_to_independent_supervisor() - Restore independent supervisor states from - * an xsave area - * @xstate: A pointer to an xsave area - * @mask: Represent the independent supervisor features restored from the xsave area + * xrstors - Restore selected components from a kernel xstate buffer + * @xstate: Pointer to the buffer + * @mask: Feature mask to select the components to restore + * + * The @xstate buffer must be 64 byte aligned and correctly initialized + * otherwise XRSTORS from that buffer can #GP. * - * Only the independent supervisor states sets in the mask are restored from the - * xsave area (See the comment in XFEATURE_MASK_INDEPENDENT for the details of - * independent supervisor feature). Besides the independent supervisor states, the - * legacy region and XSAVE header are also restored from the xsave area. The - * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and - * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. + * Proper usage is to restore the state which was saved with + * xsaves() into @xstate. * - * The xsave area must be 64-bytes aligned. + * The feature mask must either be a subset of the independent features or + * a subset of the task->fpstate related features. */ -void copy_kernel_to_independent_supervisor(struct xregs_state *xstate, u64 mask) +void xrstors(struct xregs_state *xstate, u64 mask) { - u64 independent_mask = xfeatures_mask_independent() & mask; - u32 lmask, hmask; int err; - if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + if (!validate_xsaves_xrstors(mask)) return; - if (WARN_ON_FPU(!independent_mask)) - return; - - lmask = independent_mask; - hmask = independent_mask >> 32; - - XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); - - /* Should never fault when copying from a kernel buffer */ - WARN_ON_FPU(err); + XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); + WARN_ON_ONCE(err); } #ifdef CONFIG_PROC_PID_ARCH_STATUS -- cgit v1.2.3 From 784a46618f634973a17535b7d3d03cd4ebc0ccbd Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:02:05 +0200 Subject: x86/pkeys: Move read_pkru() and write_pkru() write_pkru() was originally used just to write to the PKRU register. It was mercifully short and sweet and was not out of place in pgtable.h with some other pkey-related code. But, later work included a requirement to also modify the task XSAVE buffer when updating the register. This really is more related to the XSAVE architecture than to paging. Move the read/write_pkru() to asm/pkru.h. pgtable.h won't miss them. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.102647114@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 1 + arch/x86/include/asm/pgtable.h | 57 +----------------------------------- arch/x86/include/asm/pkru.h | 61 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_64.c | 1 + arch/x86/kvm/svm/sev.c | 1 + arch/x86/kvm/x86.c | 1 + arch/x86/mm/pkeys.c | 1 + 7 files changed, 67 insertions(+), 56 deletions(-) create mode 100644 arch/x86/include/asm/pkru.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7de2384628e2..5764cbe39014 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -6,6 +6,7 @@ #include #include +#include #include /* Bit 63 of XCR0 is reserved for future expansion */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b1099f2d9800..ec0d8e106646 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -23,7 +23,7 @@ #ifndef __ASSEMBLY__ #include -#include +#include #include #include @@ -126,35 +126,6 @@ static inline int pte_dirty(pte_t pte) return pte_flags(pte) & _PAGE_DIRTY; } - -static inline u32 read_pkru(void) -{ - if (boot_cpu_has(X86_FEATURE_OSPKE)) - return rdpkru(); - return 0; -} - -static inline void write_pkru(u32 pkru) -{ - struct pkru_state *pk; - - if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return; - - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); - - /* - * The PKRU value in xstate needs to be in sync with the value that is - * written to the CPU. The FPU restore on return to userland would - * otherwise load the previous value again. - */ - fpregs_lock(); - if (pk) - pk->pkru = pkru; - __write_pkru(pkru); - fpregs_unlock(); -} - static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; @@ -1360,32 +1331,6 @@ static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd) } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ -#define PKRU_AD_BIT 0x1 -#define PKRU_WD_BIT 0x2 -#define PKRU_BITS_PER_PKEY 2 - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -extern u32 init_pkru_value; -#else -#define init_pkru_value 0 -#endif - -static inline bool __pkru_allows_read(u32 pkru, u16 pkey) -{ - int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; - return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); -} - -static inline bool __pkru_allows_write(u32 pkru, u16 pkey) -{ - int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; - /* - * Access-disable disables writes too so we need to check - * both bits here. - */ - return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); -} - static inline u16 pte_flags_pkey(unsigned long pte_flags) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h new file mode 100644 index 000000000000..35ffcfd6e403 --- /dev/null +++ b/arch/x86/include/asm/pkru.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_PKRU_H +#define _ASM_X86_PKRU_H + +#include + +#define PKRU_AD_BIT 0x1 +#define PKRU_WD_BIT 0x2 +#define PKRU_BITS_PER_PKEY 2 + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +extern u32 init_pkru_value; +#else +#define init_pkru_value 0 +#endif + +static inline bool __pkru_allows_read(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; + return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits)); +} + +static inline bool __pkru_allows_write(u32 pkru, u16 pkey) +{ + int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; + /* + * Access-disable disables writes too so we need to check + * both bits here. + */ + return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits)); +} + +static inline u32 read_pkru(void) +{ + if (boot_cpu_has(X86_FEATURE_OSPKE)) + return rdpkru(); + return 0; +} + +static inline void write_pkru(u32 pkru) +{ + struct pkru_state *pk; + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); + + /* + * The PKRU value in xstate needs to be in sync with the value that is + * written to the CPU. The FPU restore on return to userland would + * otherwise load the previous value again. + */ + fpregs_lock(); + if (pk) + pk->pkru = pkru; + __write_pkru(pkru); + fpregs_unlock(); +} + +#endif diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d08307df69ad..4651ab08e6e1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -41,6 +41,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8d36f0c73071..62926f1a5f7b 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -19,6 +19,7 @@ #include #include +#include #include #include "x86.h" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f651b9d774c..07f788872ddf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include /* Ugh! */ #include diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index a840ac78b6de..a02cfcf02d0d 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -10,6 +10,7 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ +#include /* read/write_pkru() */ int __execute_only_pkey(struct mm_struct *mm) { -- cgit v1.2.3 From b2681e791dbcee6acb1dca7a5076a0285109ac4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:06 +0200 Subject: x86/fpu: Rename and sanitize fpu__save/copy() Both function names are a misnomer. fpu__save() is actually about synchronizing the hardware register state into the task's memory state so that either coredump or a math exception handler can inspect the state at the time where the problem happens. The function guarantees to preserve the register state, while "save" is a common terminology for saving the current state so it can be modified and restored later. This is clearly not the case here. Rename it to fpu_sync_fpstate(). fpu__copy() is used to clone the current task's FPU state when duplicating task_struct. While the register state is a copy the rest of the FPU state is not. Name it accordingly and remove the really pointless @src argument along with the warning which comes along with it. Nothing can ever copy the FPU state of a non-current task. It's clearly just a consequence of arch_dup_task_struct(), but it makes no sense to proliferate that further. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.196727450@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 6 ++++-- arch/x86/kernel/fpu/core.c | 17 ++++++++--------- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/process.c | 3 +-- arch/x86/kernel/traps.c | 5 +++-- 5 files changed, 17 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 3558cd05c62f..f5da2e9b87da 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -26,14 +26,16 @@ /* * High level FPU state handling functions: */ -extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); -extern int fpu__copy(struct task_struct *dst, struct task_struct *src); extern void fpu__clear_user_states(struct fpu *fpu); extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); +extern void fpu_sync_fpstate(struct fpu *fpu); + +extern int fpu_clone(struct task_struct *dst); + /* * Boot time FPU initialization functions: */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4a59e0fbcfd8..8762b1a8966a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -159,11 +159,10 @@ void kernel_fpu_end(void) EXPORT_SYMBOL_GPL(kernel_fpu_end); /* - * Save the FPU state (mark it for reload if necessary): - * - * This only ever gets called for the current task. + * Sync the FPU register state to current's memory register state when the + * current task owns the FPU. The hardware register state is preserved. */ -void fpu__save(struct fpu *fpu) +void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); @@ -221,18 +220,18 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -int fpu__copy(struct task_struct *dst, struct task_struct *src) +/* Clone current's FPU state on fork */ +int fpu_clone(struct task_struct *dst) { + struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; - struct fpu *src_fpu = &src->thread.fpu; + /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; - if (!static_cpu_has(X86_FEATURE_FPU)) + if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; - WARN_ON_FPU(src_fpu != ¤t->thread.fpu); - /* * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 892aec1dd822..4575796d547b 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -41,7 +41,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r static void sync_fpstate(struct fpu *fpu) { if (fpu == ¤t->thread.fpu) - fpu__save(fpu); + fpu_sync_fpstate(fpu); } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e1f38179f49..af3db530983b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -87,8 +87,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - - return fpu__copy(dst, src); + return fpu_clone(dst); } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 853ea7a80806..4c9c4aa83216 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1046,9 +1046,10 @@ static void math_error(struct pt_regs *regs, int trapnr) } /* - * Save the info for the exception handler and clear the error. + * Synchronize the FPU register state to the memory register state + * if necessary. This allows the exception handler to inspect it. */ - fpu__save(fpu); + fpu_sync_fpstate(fpu); task->thread.trap_nr = trapnr; task->thread.error_code = 0; -- cgit v1.2.3 From 8a1dc55a3f3ef0a723c3c117a567e7b5dd2c1793 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:07 +0200 Subject: x86/cpu: Sanitize X86_FEATURE_OSPKE X86_FEATURE_OSPKE is enabled first on the boot CPU and the feature flag is set. Secondary CPUs have to enable CR4.PKE as well and set their per CPU feature flag. That's ineffective because all call sites have checks for boot_cpu_data. Make it smarter and force the feature flag when PKU is enabled on the boot cpu which allows then to use cpu_feature_enabled(X86_FEATURE_OSPKE) all over the place. That either compiles the code out when PKEY support is disabled in Kconfig or uses a static_cpu_has() for the feature check which makes a significant difference in hotpaths, e.g. context switch. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.305113644@linutronix.de --- arch/x86/include/asm/pkeys.h | 8 ++++---- arch/x86/include/asm/pkru.h | 4 ++-- arch/x86/kernel/cpu/common.c | 24 +++++++++++------------- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/xstate.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/mm/fault.c | 2 +- 7 files changed, 21 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 2ff9b98812b7..4128f647c755 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -9,14 +9,14 @@ * will be necessary to ensure that the types that store key * numbers and masks have sufficient capacity. */ -#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) +#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { - return boot_cpu_has(X86_FEATURE_OSPKE); + return cpu_feature_enabled(X86_FEATURE_OSPKE); } /* @@ -26,7 +26,7 @@ static inline bool arch_pkeys_enabled(void) extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); @@ -37,7 +37,7 @@ extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 35ffcfd6e403..ec8dd2878dc9 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -32,7 +32,7 @@ static inline bool __pkru_allows_write(u32 pkru, u16 pkey) static inline u32 read_pkru(void) { - if (boot_cpu_has(X86_FEATURE_OSPKE)) + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) return rdpkru(); return 0; } @@ -41,7 +41,7 @@ static inline void write_pkru(u32 pkru) { struct pkru_state *pk; - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f875dea49d7e..dbfb335ffac4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -466,22 +466,20 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { - /* check the boot processor, plus compile options for PKU: */ - if (!cpu_feature_enabled(X86_FEATURE_PKU)) - return; - /* checks the actual processor's cpuid bits: */ - if (!cpu_has(c, X86_FEATURE_PKU)) - return; - if (pku_disabled) + if (c == &boot_cpu_data) { + if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU)) + return; + /* + * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid + * bit to be set. Enforce it. + */ + setup_force_cpu_cap(X86_FEATURE_OSPKE); + + } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) { return; + } cr4_set_bits(X86_CR4_PKE); - /* - * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE - * cpuid bit to be set. We need to ensure that we - * update that bit in this CPU's "cpu_info". - */ - set_cpu_cap(c, X86_FEATURE_OSPKE); } #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8762b1a8966a..3866954354a4 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -311,7 +311,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) else frstor(&init_fpstate.fsave); - if (boot_cpu_has(X86_FEATURE_OSPKE)) + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 735d44c3efb6..bc71609ba65a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -921,7 +921,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4651ab08e6e1..40a963809203 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -137,7 +137,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, log_lvl, d3, d6, d7); } - if (boot_cpu_has(X86_FEATURE_OSPKE)) + if (cpu_feature_enabled(X86_FEATURE_OSPKE)) printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6bda7f67d737..f33a61a432ce 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -875,7 +875,7 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code, /* This code is always called on the current mm */ bool foreign = false; - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return false; if (error_code & X86_PF_PK) return true; -- cgit v1.2.3 From 739e2eec0f4849eb411567407d61491f923db405 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:08 +0200 Subject: x86/pkru: Provide pkru_get_init_value() When CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS is disabled then the following code fails to compile: if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { u32 pkru = READ_ONCE(init_pkru_value); .. } because init_pkru_value is defined as '0' which makes READ_ONCE() upset. Provide an accessor macro to avoid #ifdeffery all over the place. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.404880646@linutronix.de --- arch/x86/include/asm/pkru.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index ec8dd2878dc9..19d3d7b98465 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -10,8 +10,10 @@ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS extern u32 init_pkru_value; +#define pkru_get_init_value() READ_ONCE(init_pkru_value) #else #define init_pkru_value 0 +#define pkru_get_init_value() 0 #endif static inline bool __pkru_allows_read(u32 pkru, u16 pkey) -- cgit v1.2.3 From ff7ebff47c595e747aa1bb10d8a30b2acb7d425b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:09 +0200 Subject: x86/pkru: Provide pkru_write_default() Provide a simple and trivial helper which just writes the PKRU default value without trying to fiddle with the task's xsave buffer. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.513729794@linutronix.de --- arch/x86/include/asm/pkru.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 19d3d7b98465..7e4550911c51 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -60,4 +60,12 @@ static inline void write_pkru(u32 pkru) fpregs_unlock(); } +static inline void pkru_write_default(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + wrpkru(pkru_get_init_value()); +} + #endif -- cgit v1.2.3 From fa8c84b77a54bf3cf351c8b4b26a5aca27a14013 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:10 +0200 Subject: x86/cpu: Write the default PKRU value when enabling PKE In preparation of making the PKRU management more independent from XSTATES, write the default PKRU value into the hardware right after enabling PKRU in CR4. This ensures that switch_to() and copy_thread() have the correct setting for init task and the per CPU idle threads right away. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.622983906@linutronix.de --- arch/x86/kernel/cpu/common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dbfb335ffac4..ca668efa4c81 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -480,6 +480,8 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) } cr4_set_bits(X86_CR4_PKE); + /* Load the default PKRU value */ + pkru_write_default(); } #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -- cgit v1.2.3 From 371071131cd1032c1e9172c51234a2a324841cab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:11 +0200 Subject: x86/fpu: Use pkru_write_default() in copy_init_fpstate_to_fpregs() There is no point in using copy_init_pkru_to_fpregs() which in turn calls write_pkru(). write_pkru() tries to fiddle with the task's xstate buffer for nothing because the XRSTOR[S](init_fpstate) just cleared the xfeature flag in the xstate header which makes get_xsave_addr() fail. It's a useless exercise anyway because the reinitialization activates the FPU so before the task's xstate buffer can be used again a XRSTOR[S] must happen which in turn dumps the PKRU value. Get rid of the now unused copy_init_pkru_to_fpregs(). Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.732508792@linutronix.de --- arch/x86/include/asm/pkeys.h | 1 - arch/x86/kernel/fpu/core.c | 3 +-- arch/x86/mm/pkeys.c | 17 ----------------- include/linux/pkeys.h | 4 ---- 4 files changed, 1 insertion(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 4128f647c755..5c7bcaa79623 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -124,7 +124,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); -extern void copy_init_pkru_to_fpregs(void); static inline int vma_pkey(struct vm_area_struct *vma) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3866954354a4..fedadcb04ba2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -311,8 +311,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) else frstor(&init_fpstate.fsave); - if (cpu_feature_enabled(X86_FEATURE_OSPKE)) - copy_init_pkru_to_fpregs(); + pkru_write_default(); } /* diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index a02cfcf02d0d..fb171a5d7f33 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -10,7 +10,6 @@ #include /* boot_cpu_has, ... */ #include /* vma_pkey() */ -#include /* read/write_pkru() */ int __execute_only_pkey(struct mm_struct *mm) { @@ -125,22 +124,6 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) | PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15); -/* - * Called from the FPU code when creating a fresh set of FPU - * registers. This is called from a very specific context where - * we know the FPU registers are safe for use and we can use PKRU - * directly. - */ -void copy_init_pkru_to_fpregs(void) -{ - u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); - /* - * Override the PKRU state that came from 'init_fpstate' - * with the baseline from the process. - */ - write_pkru(init_pkru_value_snapshot); -} - static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 2955ba976048..6beb26b7151d 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -44,10 +44,6 @@ static inline bool arch_pkeys_enabled(void) return false; } -static inline void copy_init_pkru_to_fpregs(void) -{ -} - #endif /* ! CONFIG_ARCH_HAS_PKEYS */ #endif /* _LINUX_PKEYS_H */ -- cgit v1.2.3 From e7ecad17c84d0f6bef635c20d02bbe4096eea700 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:12 +0200 Subject: x86/fpu: Rename fpu__clear_all() to fpu_flush_thread() Make it clear what the function is about. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.827979263@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 3 ++- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/process.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f5da2e9b87da..dabbb700c0eb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -29,12 +29,13 @@ extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern void fpu__clear_user_states(struct fpu *fpu); -extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); +/* Clone and exit operations */ extern int fpu_clone(struct task_struct *dst); +extern void fpu_flush_thread(void); /* * Boot time FPU initialization functions: diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index fedadcb04ba2..4b69be9bea55 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -350,9 +350,9 @@ void fpu__clear_user_states(struct fpu *fpu) fpu__clear(fpu, true); } -void fpu__clear_all(struct fpu *fpu) +void fpu_flush_thread(void) { - fpu__clear(fpu, false); + fpu__clear(¤t->thread.fpu, false); } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index af3db530983b..19d05d39a81d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -205,7 +205,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear_all(&tsk->thread.fpu); + fpu_flush_thread(); } void disable_TSC(void) -- cgit v1.2.3 From 33344368cb08f8d6bf55a32aa052318d3a69ea84 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 23 Jun 2021 14:02:13 +0200 Subject: x86/fpu: Clean up the fpu__clear() variants fpu__clear() currently resets both register state and kernel XSAVE buffer state. It has two modes: one for all state (supervisor and user) and another for user state only. fpu__clear_all() uses the "all state" (user_only=0) mode, while a number of signal paths use the user_only=1 mode. Make fpu__clear() work only for user state (user_only=1) and remove the "all state" (user_only=0) code. Rename it to match so it can be used by the signal paths. Replace the "all state" (user_only=0) fpu__clear() functionality. Use the TIF_NEED_FPU_LOAD functionality instead of making any actual hardware registers changes in this path. Instead of invoking fpu__initialize() just memcpy() init_fpstate into the task's FPU state because that has already the correct format and in case of PKRU also contains the default PKRU value. Move the actual PKRU write out into flush_thread() where it belongs and where it will end up anyway when PKRU and XSTATE have been untangled. For bisectability a workaround is required which stores the PKRU value in the xstate memory until PKRU is untangled from XSTATE for context switching and return to user. [ Dave Hansen: Polished changelog ] [ tglx: Fixed the PKRU fallout ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121455.922729522@linutronix.de --- arch/x86/kernel/fpu/core.c | 113 ++++++++++++++++++++++++++++++--------------- arch/x86/kernel/process.c | 10 ++++ 2 files changed, 86 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4b69be9bea55..aa7e808b9d1e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -259,19 +259,6 @@ int fpu_clone(struct task_struct *dst) return 0; } -/* - * Activate the current task's in-memory FPU context, - * if it has not been used before: - */ -static void fpu__initialize(struct fpu *fpu) -{ - WARN_ON_FPU(fpu != ¤t->thread.fpu); - - set_thread_flag(TIF_NEED_FPU_LOAD); - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); -} - /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents @@ -314,47 +301,99 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) pkru_write_default(); } +static inline unsigned int init_fpstate_copy_size(void) +{ + if (!use_xsave()) + return fpu_kernel_xstate_size; + + /* XSAVE(S) just needs the legacy and the xstate header part */ + return sizeof(init_fpstate.xsave); +} + +/* Temporary workaround. Will be removed once PKRU and XSTATE are untangled. */ +static inline void pkru_set_default_in_xstate(struct xregs_state *xsave) +{ + struct pkru_state *pk; + + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + /* + * Force XFEATURE_PKRU to be set in the header otherwise + * get_xsave_addr() does not work and it also needs to be set to + * make XRSTOR(S) load it. + */ + xsave->header.xfeatures |= XFEATURE_MASK_PKRU; + pk = get_xsave_addr(xsave, XFEATURE_PKRU); + pk->pkru = pkru_get_init_value(); +} + /* - * Clear the FPU state back to init state. - * - * Called by sys_execve(), by the signal handler code and by various - * error paths. + * Reset current->fpu memory state to the init values. + */ +static void fpu_reset_fpstate(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_lock(); + fpu__drop(fpu); + /* + * This does not change the actual hardware registers. It just + * resets the memory image and sets TIF_NEED_FPU_LOAD so a + * subsequent return to usermode will reload the registers from the + * task's memory image. + * + * Do not use fpstate_init() here. Just copy init_fpstate which has + * the correct content already except for PKRU. + */ + memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); + pkru_set_default_in_xstate(&fpu->state.xsave); + set_thread_flag(TIF_NEED_FPU_LOAD); + fpregs_unlock(); +} + +/* + * Reset current's user FPU states to the init states. current's + * supervisor states, if any, are not modified by this function. The + * caller guarantees that the XSTATE header in memory is intact. */ -static void fpu__clear(struct fpu *fpu, bool user_only) +void fpu__clear_user_states(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) { - fpu__drop(fpu); - fpu__initialize(fpu); + fpregs_lock(); + if (!cpu_feature_enabled(X86_FEATURE_FPU)) { + fpu_reset_fpstate(); + fpregs_unlock(); return; } - fpregs_lock(); - - if (user_only) { - if (!fpregs_state_valid(fpu, smp_processor_id()) && - xfeatures_mask_supervisor()) - os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); - restore_fpregs_from_init_fpstate(xfeatures_mask_user()); - } else { - restore_fpregs_from_init_fpstate(xfeatures_mask_all); + /* + * Ensure that current's supervisor states are loaded into their + * corresponding registers. + */ + if (xfeatures_mask_supervisor() && + !fpregs_state_valid(fpu, smp_processor_id())) { + os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); } + /* Reset user states in registers. */ + restore_fpregs_from_init_fpstate(xfeatures_mask_user()); + + /* + * Now all FPU registers have their desired values. Inform the FPU + * state machine that current's FPU registers are in the hardware + * registers. The memory image does not need to be updated because + * any operation relying on it has to save the registers first when + * current's FPU is marked active. + */ fpregs_mark_activate(); fpregs_unlock(); } -void fpu__clear_user_states(struct fpu *fpu) -{ - fpu__clear(fpu, true); -} - void fpu_flush_thread(void) { - fpu__clear(¤t->thread.fpu, false); + fpu_reset_fpstate(); } - /* * Load FPU context before returning to userspace. */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 19d05d39a81d..de942b038815 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -198,6 +198,15 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, return ret; } +static void pkru_flush_thread(void) +{ + /* + * If PKRU is enabled the default PKRU value has to be loaded into + * the hardware right here (similar to context switch). + */ + pkru_write_default(); +} + void flush_thread(void) { struct task_struct *tsk = current; @@ -206,6 +215,7 @@ void flush_thread(void) memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); fpu_flush_thread(); + pkru_flush_thread(); } void disable_TSC(void) -- cgit v1.2.3 From 727d01100e15b18c67f05fb697779ad2a6c99b63 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:14 +0200 Subject: x86/fpu: Rename __fpregs_load_activate() to fpregs_restore_userregs() Rename it so that it becomes entirely clear what this function is about. It's purpose is to restore the FPU registers to the state which was saved in the task's FPU memory state either at context switch or by an in kernel FPU user. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.018867925@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 6 ++---- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index dabbb700c0eb..f7688f6e86bc 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -465,10 +465,8 @@ static inline void fpregs_activate(struct fpu *fpu) trace_x86_fpu_regs_activated(fpu); } -/* - * Internal helper, do not use directly. Use switch_fpu_return() instead. - */ -static inline void __fpregs_load_activate(void) +/* Internal helper for switch_fpu_return() and signal frame setup */ +static inline void fpregs_restore_userregs(void) { struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index aa7e808b9d1e..6babf1876389 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -402,7 +402,7 @@ void switch_fpu_return(void) if (!static_cpu_has(X86_FEATURE_FPU)) return; - __fpregs_load_activate(); + fpregs_restore_userregs(); } EXPORT_SYMBOL_GPL(switch_fpu_return); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index fd4b58d7b72e..b12665c38a30 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -188,7 +188,7 @@ retry: */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) - __fpregs_load_activate(); + fpregs_restore_userregs(); pagefault_disable(); ret = copy_fpregs_to_sigframe(buf_fx); -- cgit v1.2.3 From 1d9bffab116fadfe1594f5fea2b50ab280d81d30 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:15 +0200 Subject: x86/fpu: Move FXSAVE_LEAK quirk info __copy_kernel_to_fpregs() copy_kernel_to_fpregs() restores all xfeatures but it is also the place where the AMD FXSAVE_LEAK bug is handled. That prevents fpregs_restore_userregs() to limit the restored features, which is required to untangle PKRU and XSTATE handling and also for the upcoming supervisor state management. Move the FXSAVE_LEAK quirk into __copy_kernel_to_fpregs() and deinline that function which has become rather fat. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.114271278@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 25 +------------------------ arch/x86/kernel/fpu/core.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index f7688f6e86bc..63d979656437 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -379,33 +379,10 @@ static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask) return err; } -static inline void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) -{ - if (use_xsave()) { - os_xrstor(&fpstate->xsave, mask); - } else { - if (use_fxsr()) - fxrstor(&fpstate->fxsave); - else - frstor(&fpstate->fsave); - } -} +extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask); static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) { - /* - * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is - * pending. Clear the x87 state here by setting it to fixed values. - * "m" is a random variable that should be in L1. - */ - if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { - asm volatile( - "fnclex\n\t" - "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (fpstate)); - } - __restore_fpregs_from_fpstate(fpstate, -1); } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 6babf1876389..afd0deee8cfd 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -124,6 +124,33 @@ void save_fpregs_to_fpstate(struct fpu *fpu) } EXPORT_SYMBOL(save_fpregs_to_fpstate); +void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) +{ + /* + * AMD K7/K8 and later CPUs up to Zen don't save/restore + * FDP/FIP/FOP unless an exception is pending. Clear the x87 state + * here by setting it to fixed values. "m" is a random variable + * that should be in L1. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { + asm volatile( + "fnclex\n\t" + "emms\n\t" + "fildl %P[addr]" /* set F?P to defined value */ + : : [addr] "m" (fpstate)); + } + + if (use_xsave()) { + os_xrstor(&fpstate->xsave, mask); + } else { + if (use_fxsr()) + fxrstor(&fpstate->fxsave); + else + frstor(&fpstate->fsave); + } +} +EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate); + void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); -- cgit v1.2.3 From 65e952102122bf89f0e4f1bebf8664e32587aaed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:16 +0200 Subject: x86/fpu: Rename xfeatures_mask_user() to xfeatures_mask_uabi() Rename it so it's clear that this is about user ABI features which can differ from the feature set which the kernel saves and restores because the kernel handles e.g. PKRU differently. But the user ABI (ptrace, signal frame) expects it to be there. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.211585137@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 7 ++++++- arch/x86/include/asm/fpu/xstate.h | 6 +++++- arch/x86/kernel/fpu/core.c | 2 +- arch/x86/kernel/fpu/signal.c | 10 +++++----- arch/x86/kernel/fpu/xstate.c | 18 +++++++++--------- 5 files changed, 26 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 63d979656437..bcfb6365a76b 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -324,7 +324,12 @@ static inline void os_xrstor(struct xregs_state *xstate, u64 mask) */ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) { - u64 mask = xfeatures_mask_user(); + /* + * Include the features which are not xsaved/rstored by the kernel + * internally, e.g. PKRU. That's user space ABI and also required + * to allow the signal handler to modify PKRU. + */ + u64 mask = xfeatures_mask_uabi(); u32 lmask = mask; u32 hmask = mask >> 32; int err; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 5764cbe39014..af9ea134bf5e 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -83,7 +83,11 @@ static inline u64 xfeatures_mask_supervisor(void) return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; } -static inline u64 xfeatures_mask_user(void) +/* + * The xfeatures which are enabled in XCR0 and expected to be in ptrace + * buffers and signal frames. + */ +static inline u64 xfeatures_mask_uabi(void) { return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index afd0deee8cfd..12437383ff79 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -404,7 +404,7 @@ void fpu__clear_user_states(struct fpu *fpu) } /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_user()); + restore_fpregs_from_init_fpstate(xfeatures_mask_uabi()); /* * Now all FPU registers have their desired values. Inform the FPU diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b12665c38a30..a42bc9d0b1cc 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -257,14 +257,14 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) if (use_xsave()) { if (fx_only) { - init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + init_bv = xfeatures_mask_uabi() & ~XFEATURE_MASK_FPSSE; r = fxrstor_from_user_sigframe(buf); if (!r) os_xrstor(&init_fpstate.xsave, init_bv); return r; } else { - init_bv = xfeatures_mask_user() & ~xbv; + init_bv = xfeatures_mask_uabi() & ~xbv; r = xrstor_from_user_sigframe(buf, xbv); if (!r && unlikely(init_bv)) @@ -420,7 +420,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; + u64 init_bv = xfeatures_mask_uabi() & ~user_xfeatures; ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) @@ -454,7 +454,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave()) { u64 init_bv; - init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + init_bv = xfeatures_mask_uabi() & ~XFEATURE_MASK_FPSSE; os_xrstor(&init_fpstate.xsave, init_bv); } @@ -549,7 +549,7 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask_user(); + fx_sw_reserved.xfeatures = xfeatures_mask_uabi(); fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index bc71609ba65a..c513596f8ec9 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -144,7 +144,7 @@ void fpu__init_cpu_xstate(void) * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. @@ -453,7 +453,7 @@ int xfeature_size(int xfeature_nr) static int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~xfeatures_mask_user()) + if (hdr->xfeatures & ~xfeatures_mask_uabi()) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -756,7 +756,7 @@ void __init fpu__init_system_xstate(void) cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); xfeatures_mask_all |= ecx + ((u64)edx << 32); - if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue @@ -791,7 +791,7 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user()); + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -828,14 +828,14 @@ void fpu__resume_cpu(void) /* * Restore XCR0 on xsave capable CPUs: */ - if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi()); /* * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } @@ -993,7 +993,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= xfeatures_mask_user(); + header.xfeatures &= xfeatures_mask_uabi(); break; } @@ -1038,7 +1038,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, * compacted init_fpstate. The gap tracking will zero this * later. */ - if (!(xfeatures_mask_user() & BIT_ULL(i))) + if (!(xfeatures_mask_uabi() & BIT_ULL(i))) continue; /* -- cgit v1.2.3 From 2ebe81c6d800576e1213f9d7cf0068017ae610c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:17 +0200 Subject: x86/fpu: Dont restore PKRU in fpregs_restore_userspace() switch_to() and flush_thread() write the task's PKRU value eagerly so the PKRU value of current is always valid in the hardware. That means there is no point in restoring PKRU on exit to user or when reactivating the task's FPU registers in the signal frame setup path. This allows to remove all the xstate buffer updates with PKRU values once the PKRU state is stored in thread struct while a task is scheduled out. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.303919033@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 16 +++++++++++++++- arch/x86/include/asm/fpu/xstate.h | 19 +++++++++++++++++++ arch/x86/kernel/fpu/core.c | 2 +- 3 files changed, 35 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index bcfb6365a76b..521774320e6a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -457,7 +457,21 @@ static inline void fpregs_restore_userregs(void) return; if (!fpregs_state_valid(fpu, cpu)) { - restore_fpregs_from_fpstate(&fpu->state); + u64 mask; + + /* + * This restores _all_ xstate which has not been + * established yet. + * + * If PKRU is enabled, then the PKRU value is already + * correct because it was either set in switch_to() or in + * flush_thread(). So it is excluded because it might be + * not up to date in current->thread.fpu.xsave state. + */ + mask = xfeatures_mask_restore_user() | + xfeatures_mask_supervisor(); + __restore_fpregs_from_fpstate(&fpu->state, mask); + fpregs_activate(fpu); fpu->last_cpu = cpu; } diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index af9ea134bf5e..6a0aaafb93ec 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -35,6 +35,14 @@ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR) +/* + * Features which are restored when returning to user space. + * PKRU is not restored on return to user space because PKRU + * is switched eagerly in switch_to() and flush_thread() + */ +#define XFEATURE_MASK_USER_RESTORE \ + (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) + /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID) @@ -92,6 +100,17 @@ static inline u64 xfeatures_mask_uabi(void) return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } +/* + * The xfeatures which are restored by the kernel when returning to user + * mode. This is not necessarily the same as xfeatures_mask_uabi() as the + * kernel does not manage all XCR0 enabled features via xsave/xrstor as + * some of them have to be switched eagerly on context switch and exec(). + */ +static inline u64 xfeatures_mask_restore_user(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE; +} + static inline u64 xfeatures_mask_independent(void) { if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 12437383ff79..470576ced907 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -404,7 +404,7 @@ void fpu__clear_user_states(struct fpu *fpu) } /* Reset user states in registers. */ - restore_fpregs_from_init_fpstate(xfeatures_mask_uabi()); + restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user()); /* * Now all FPU registers have their desired values. Inform the FPU -- cgit v1.2.3 From 9782a712eb971ce483442076e79eb1d8d608646e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:02:18 +0200 Subject: x86/fpu: Add PKRU storage outside of task XSAVE buffer PKRU is currently partly XSAVE-managed and partly not. It has space in the task XSAVE buffer and is context-switched by XSAVE/XRSTOR. However, it is switched more eagerly than FPU because there may be a need for PKRU to be up-to-date for things like copy_to/from_user() since PKRU affects user-permission memory accesses, not just accesses from userspace itself. This leaves PKRU in a very odd position. XSAVE brings very little value to the table for how Linux uses PKRU except for signal related XSTATE handling. Prepare to move PKRU away from being XSAVE-managed. Allocate space in the thread_struct for it and save/restore it in the context-switch path separately from the XSAVE-managed features. task->thread_struct.pkru is only valid when the task is scheduled out. For the current task the authoritative source is the hardware, i.e. it has to be retrieved via rdpkru(). Leave the XSAVE code in place for now to ensure bisectability. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.399107624@linutronix.de --- arch/x86/include/asm/processor.h | 9 +++++++++ arch/x86/kernel/process.c | 7 +++++++ arch/x86/kernel/process_64.c | 25 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 556b2b17c3e2..91946fc3c006 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -518,6 +518,15 @@ struct thread_struct { unsigned int sig_on_uaccess_err:1; + /* + * Protection Keys Register for Userspace. Loaded immediately on + * context switch. Store it in thread_struct to avoid a lookup in + * the tasks's FPU xstate buffer. This value is only valid when a + * task is scheduled out. For 'current' the authoritative source of + * PKRU is the hardware itself. + */ + u32 pkru; + /* Floating point and extended processor state */ struct fpu fpu; /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index de942b038815..fa6c8fa0f778 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -156,11 +156,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { + p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, sp, arg); return 0; } + /* + * Clone current's PKRU value from hardware. tsk->thread.pkru + * is only valid when scheduled out. + */ + p->thread.pkru = read_pkru(); + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 40a963809203..ec0d836a13b1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -340,6 +340,29 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } +/* + * Store prev's PKRU value and load next's PKRU value if they differ. PKRU + * is not XSTATE managed on context switch because that would require a + * lookup in the task's FPU xsave buffer and require to keep that updated + * in various places. + */ +static __always_inline void x86_pkru_load(struct thread_struct *prev, + struct thread_struct *next) +{ + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; + + /* Stash the prev task's value: */ + prev->pkru = rdpkru(); + + /* + * PKRU writes are slightly expensive. Avoid them when not + * strictly necessary: + */ + if (prev->pkru != next->pkru) + wrpkru(next->pkru); +} + static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { @@ -589,6 +612,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) x86_fsgsbase_load(prev, next); + x86_pkru_load(prev, next); + /* * Switch the PDA and FPU contexts. */ -- cgit v1.2.3 From e84ba47e313dbc097bf859bb6e4f9219883d5f78 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 23 Jun 2021 14:02:19 +0200 Subject: x86/fpu: Hook up PKRU into ptrace() One nice thing about having PKRU be XSAVE-managed is that it gets naturally exposed into the XSAVE-using ABIs. Now that XSAVE will not be used to manage PKRU, these ABIs need to be manually enabled to deal with PKRU. ptrace() uses copy_uabi_xstate_to_kernel() to collect the tracee's XSTATE. As PKRU is not in the task's XSTATE buffer, use task->thread.pkru for filling in up the ptrace buffer. Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.508770763@linutronix.de --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 10 ++++------ arch/x86/kernel/fpu/xstate.c | 25 ++++++++++++++++++------- 3 files changed, 23 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 6a0aaafb93ec..4ff4a0093a48 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -139,7 +139,7 @@ enum xstate_copy_mode { }; struct membuf; -void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 4575796d547b..66ed317ebc0d 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -78,7 +78,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, sizeof(fpu->state.fxsave)); } - copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_FX); + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX); return 0; } @@ -126,14 +126,12 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, int xstateregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; - if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - sync_fpstate(fpu); + sync_fpstate(&target->thread.fpu); - copy_xstate_to_uabi_buf(to, &fpu->state.xsave, XSTATE_COPY_XSAVE); + copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); return 0; } @@ -336,7 +334,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) }; /* Handle init state optimized xstate correctly */ - copy_xstate_to_uabi_buf(mb, &fpu->state.xsave, XSTATE_COPY_FP); + copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP); fx = &fxsave; } else { fx = &fpu->state.fxsave; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c513596f8ec9..9fd124a001b0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -962,7 +962,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, /** * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor - * @xsave: The kernel xstate buffer to copy from + * @tsk: The task from which to copy the saved xstate * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming @@ -971,10 +971,11 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * * It supports partial copy but @to.pos always starts from zero. */ -void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, +void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); + struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; struct xregs_state *xinit = &init_fpstate.xsave; struct xstate_header header; unsigned int zerofrom; @@ -1048,11 +1049,21 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct xregs_state *xsave, if (zerofrom < xstate_offsets[i]) membuf_zero(&to, xstate_offsets[i] - zerofrom); - copy_feature(header.xfeatures & BIT_ULL(i), &to, - __raw_xsave_addr(xsave, i), - __raw_xsave_addr(xinit, i), - xstate_sizes[i]); - + if (i == XFEATURE_PKRU) { + struct pkru_state pkru = {0}; + /* + * PKRU is not necessarily up to date in the + * thread's XSAVE buffer. Fill this part from the + * per-thread storage. + */ + pkru.pkru = tsk->thread.pkru; + membuf_write(&to, &pkru, sizeof(pkru)); + } else { + copy_feature(header.xfeatures & BIT_ULL(i), &to, + __raw_xsave_addr(xsave, i), + __raw_xsave_addr(xinit, i), + xstate_sizes[i]); + } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. -- cgit v1.2.3 From 30a304a138738d71a09c730ca8044e9662de0dbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:20 +0200 Subject: x86/fpu: Mask PKRU from kernel XRSTOR[S] operations As the PKRU state is managed separately restoring it from the xstate buffer would be counterproductive as it might either restore a stale value or reinit the PKRU state to 0. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.606745195@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 4 ++-- arch/x86/include/asm/fpu/xstate.h | 10 ++++++++++ arch/x86/kernel/fpu/xstate.c | 1 + arch/x86/mm/extable.c | 2 +- 4 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 521774320e6a..2a484f5f2413 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -259,7 +259,7 @@ static inline void fxsave(struct fxregs_state *fx) */ static inline void os_xrstor_booting(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_fpstate(); u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -388,7 +388,7 @@ extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask) static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate) { - __restore_fpregs_from_fpstate(fpstate, -1); + __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate()); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 4ff4a0093a48..109dfcc75299 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -111,6 +111,16 @@ static inline u64 xfeatures_mask_restore_user(void) return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE; } +/* + * Like xfeatures_mask_restore_user() but additionally restors the + * supported supervisor states. + */ +static inline u64 xfeatures_mask_fpstate(void) +{ + return xfeatures_mask_all & \ + (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED); +} + static inline u64 xfeatures_mask_independent(void) { if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9fd124a001b0..21a10a66c4e4 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -60,6 +60,7 @@ static short xsave_cpuid_features[] __initdata = { * XSAVE buffer, both supervisor and user xstates. */ u64 xfeatures_mask_all __ro_after_init; +EXPORT_SYMBOL_GPL(xfeatures_mask_all); static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 2c5cccd77b8b..e1664e9f969c 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -65,7 +65,7 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.", (void *)instruction_pointer(regs)); - __restore_fpregs_from_fpstate(&init_fpstate, -1); + __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate()); return true; } EXPORT_SYMBOL_GPL(ex_handler_fprestore); -- cgit v1.2.3 From 954436989cc550dd91aab98363240c9c0a4b7e23 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:21 +0200 Subject: x86/fpu: Remove PKRU handling from switch_fpu_finish() PKRU is already updated and the xstate is not longer the proper source of information. [ bp: Use cpu_feature_enabled() ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.708180184@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2a484f5f2413..528a86826f95 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -523,39 +523,13 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ /* - * Load PKRU from the FPU context if available. Delay loading of the - * complete FPU state until the return to userland. + * Delay loading of the complete FPU state until the return to userland. + * PKRU is handled separately. */ static inline void switch_fpu_finish(struct fpu *new_fpu) { - u32 pkru_val = init_pkru_value; - struct pkru_state *pk; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return; - - set_thread_flag(TIF_NEED_FPU_LOAD); - - if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) - return; - - /* - * PKRU state is switched eagerly because it needs to be valid before we - * return to userland e.g. for a copy_to_user() operation. - */ - if (!(current->flags & PF_KTHREAD)) { - /* - * If the PKRU bit in xsave.header.xfeatures is not set, - * then the PKRU component was in init state, which means - * XRSTOR will set PKRU to 0. If the bit is not set then - * get_xsave_addr() will return NULL because the PKRU value - * in memory is not valid. This means pkru_val has to be - * set to 0 and not to init_pkru_value. - */ - pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); - pkru_val = pk ? pk->pkru : 0; - } - __write_pkru(pkru_val); + if (cpu_feature_enabled(X86_FEATURE_FPU)) + set_thread_flag(TIF_NEED_FPU_LOAD); } #endif /* _ASM_X86_FPU_INTERNAL_H */ -- cgit v1.2.3 From 0e8c54f6b2c8b1037cef9276e451522ee90ed969 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:22 +0200 Subject: x86/fpu: Don't store PKRU in xstate in fpu_reset_fpstate() PKRU for a task is stored in task->thread.pkru when the task is scheduled out. For 'current' the authoritative source of PKRU is the hardware. fpu_reset_fpstate() has two callers: 1) fpu__clear_user_states() for !FPU systems. For those PKRU is irrelevant 2) fpu_flush_thread() which is invoked from flush_thread(). flush_thread() resets the hardware to the kernel restrictive default value. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.802850233@linutronix.de --- arch/x86/kernel/fpu/core.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 470576ced907..5295cbafc92e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -337,23 +337,6 @@ static inline unsigned int init_fpstate_copy_size(void) return sizeof(init_fpstate.xsave); } -/* Temporary workaround. Will be removed once PKRU and XSTATE are untangled. */ -static inline void pkru_set_default_in_xstate(struct xregs_state *xsave) -{ - struct pkru_state *pk; - - if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) - return; - /* - * Force XFEATURE_PKRU to be set in the header otherwise - * get_xsave_addr() does not work and it also needs to be set to - * make XRSTOR(S) load it. - */ - xsave->header.xfeatures |= XFEATURE_MASK_PKRU; - pk = get_xsave_addr(xsave, XFEATURE_PKRU); - pk->pkru = pkru_get_init_value(); -} - /* * Reset current->fpu memory state to the init values. */ @@ -371,9 +354,12 @@ static void fpu_reset_fpstate(void) * * Do not use fpstate_init() here. Just copy init_fpstate which has * the correct content already except for PKRU. + * + * PKRU handling does not rely on the xstate when restoring for + * user space as PKRU is eagerly written in switch_to() and + * flush_thread(). */ memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size()); - pkru_set_default_in_xstate(&fpu->state.xsave); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } -- cgit v1.2.3 From 72a6c08c44e4460e39315ca828f60b8d5afd6b19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:23 +0200 Subject: x86/pkru: Remove xstate fiddling from write_pkru() The PKRU value of a task is stored in task->thread.pkru when the task is scheduled out. PKRU is restored on schedule in from there. So keeping the XSAVE buffer up to date is a pointless exercise. Remove the xstate fiddling and cleanup all related functions. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.897372712@linutronix.de --- arch/x86/include/asm/pkru.h | 17 ++++------------- arch/x86/include/asm/special_insns.h | 14 +------------- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 7 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h index 7e4550911c51..ccc539faa5bb 100644 --- a/arch/x86/include/asm/pkru.h +++ b/arch/x86/include/asm/pkru.h @@ -41,23 +41,14 @@ static inline u32 read_pkru(void) static inline void write_pkru(u32 pkru) { - struct pkru_state *pk; - if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; - - pk = get_xsave_addr(¤t->thread.fpu.state.xsave, XFEATURE_PKRU); - /* - * The PKRU value in xstate needs to be in sync with the value that is - * written to the CPU. The FPU restore on return to userland would - * otherwise load the previous value again. + * WRPKRU is relatively expensive compared to RDPKRU. + * Avoid WRPKRU when it would not change the value. */ - fpregs_lock(); - if (pk) - pk->pkru = pkru; - __write_pkru(pkru); - fpregs_unlock(); + if (pkru != rdpkru()) + wrpkru(pkru); } static inline void pkru_write_default(void) diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2acd6cb62328..f3fbb84ff8a7 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -104,25 +104,13 @@ static inline void wrpkru(u32 pkru) : : "a" (pkru), "c"(ecx), "d"(edx)); } -static inline void __write_pkru(u32 pkru) -{ - /* - * WRPKRU is relatively expensive compared to RDPKRU. - * Avoid WRPKRU when it would not change the value. - */ - if (pkru == rdpkru()) - return; - - wrpkru(pkru); -} - #else static inline u32 rdpkru(void) { return 0; } -static inline void __write_pkru(u32 pkru) +static inline void wrpkru(u32 pkru) { } #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 07f788872ddf..8ee7add0e763 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -943,7 +943,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && vcpu->arch.pkru != vcpu->arch.host_pkru) - __write_pkru(vcpu->arch.pkru); + write_pkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -957,7 +957,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) - __write_pkru(vcpu->arch.host_pkru); + write_pkru(vcpu->arch.host_pkru); } if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { -- cgit v1.2.3 From bf68a7d98922e1665019b8bf0c4791500837c857 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:24 +0200 Subject: x86/fpu: Mark init_fpstate __ro_after_init Nothing has to write into that state after init. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121456.992342060@linutronix.de --- arch/x86/kernel/fpu/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 5295cbafc92e..7ada7bd03a32 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -23,7 +23,7 @@ * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ -union fpregs_state init_fpstate __read_mostly; +union fpregs_state init_fpstate __ro_after_init; /* * Track whether the kernel is using the FPU state -- cgit v1.2.3 From 99a5901951b70251965b0d1542d4a8c616842a99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:25 +0200 Subject: x86/fpu/signal: Move initial checks into fpu__restore_sig() __fpu__restore_sig() is convoluted and some of the basic checks can trivially be done in the calling function as well as the final error handling of clearing user state. [ bp: Fixup typos. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.086336154@linutronix.de --- arch/x86/kernel/fpu/signal.c | 76 ++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a42bc9d0b1cc..42e85c3fe9de 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -277,11 +277,11 @@ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) return frstor_from_user_sigframe(buf); } -static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) +static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, + bool ia32_fxstate) { struct user_i387_ia32_struct *envp = NULL; int state_size = fpu_kernel_xstate_size; - int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; @@ -289,26 +289,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int fx_only = 0; int ret = 0; - ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || - IS_ENABLED(CONFIG_IA32_EMULATION)); - - if (!buf) { - fpu__clear_user_states(fpu); - return 0; - } - - if (!access_ok(buf, size)) { - ret = -EACCES; - goto out; - } - - if (!static_cpu_has(X86_FEATURE_FPU)) { - ret = fpregs_soft_set(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), - NULL, buf); - goto out; - } - if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { @@ -391,7 +371,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ ret = __copy_from_user(&env, buf, sizeof(env)); if (ret) - goto out; + return ret; envp = &env; } @@ -424,7 +404,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) - goto out; + return ret; sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, fx_only); @@ -442,10 +422,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); - if (ret) { - ret = -EFAULT; - goto out; - } + if (ret) + return -EFAULT; sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, fx_only); @@ -462,7 +440,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } else { ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); if (ret) - goto out; + return ret; fpregs_lock(); ret = frstor_safe(&fpu->state.fsave); @@ -472,10 +450,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) else fpregs_deactivate(fpu); fpregs_unlock(); - -out: - if (ret) - fpu__clear_user_states(fpu); return ret; } @@ -490,15 +464,47 @@ static inline int xstate_sigframe_size(void) */ int fpu__restore_sig(void __user *buf, int ia32_frame) { + unsigned int size = xstate_sigframe_size(); + struct fpu *fpu = ¤t->thread.fpu; void __user *buf_fx = buf; - int size = xstate_sigframe_size(); + bool ia32_fxstate = false; + int ret; + + if (unlikely(!buf)) { + fpu__clear_user_states(fpu); + return 0; + } + ia32_frame &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); + + /* + * Only FXSR enabled systems need the FX state quirk. + * FRSTOR does not need it and can use the fast path. + */ if (ia32_frame && use_fxsr()) { buf_fx = buf + sizeof(struct fregs_state); size += sizeof(struct fregs_state); + ia32_fxstate = true; } - return __fpu__restore_sig(buf, buf_fx, size); + if (!access_ok(buf, size)) { + ret = -EACCES; + goto out; + } + + if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) { + ret = fpregs_soft_set(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), + NULL, buf); + } else { + ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate); + } + +out: + if (unlikely(ret)) + fpu__clear_user_states(fpu); + return ret; } unsigned long -- cgit v1.2.3 From 9ba589f9cdbd8906465b108bc7ec0fc1519a06d3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:26 +0200 Subject: x86/fpu/signal: Remove the legacy alignment check Checking for the XSTATE buffer being 64-byte aligned, and if not, deciding just to restore the FXSR state is daft. If user space provides an unaligned math frame and has the extended state magic set in the FX software reserved bytes, then it really can keep the pieces. If the frame is unaligned and the FX software magic is not set, then fx_only is already set and the restore will use fxrstor. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.184149902@linutronix.de --- arch/x86/kernel/fpu/signal.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 42e85c3fe9de..8a327c05bb86 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -306,9 +306,6 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, } } - if ((unsigned long)buf_fx % 64) - fx_only = 1; - if (!ia32_fxstate) { /* * Attempt to restore the FPU registers directly from user -- cgit v1.2.3 From 1258a8c896044564514c1b53795ba3033b1e9fd6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:27 +0200 Subject: x86/fpu/signal: Sanitize the xstate check on sigframe Utilize the check for the extended state magic in the FX software reserved bytes and set the parameters for restoring fx_only in the relevant members of fw_sw_user. This allows further cleanups on top because the data is consistent. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.277738268@linutronix.de --- arch/x86/kernel/fpu/signal.c | 70 +++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 8a327c05bb86..d55241038871 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -15,29 +15,30 @@ #include #include -static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; +static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init; +static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init; /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. */ -static inline int check_for_xstate(struct fxregs_state __user *buf, - void __user *fpstate, - struct _fpx_sw_bytes *fx_sw) +static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, + struct _fpx_sw_bytes *fx_sw) { int min_xstate_size = sizeof(struct fxregs_state) + sizeof(struct xstate_header); + void __user *fpstate = fxbuf; unsigned int magic2; - if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) - return -1; + if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) + return -EFAULT; /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || fx_sw->xstate_size > fpu_user_xstate_size || fx_sw->xstate_size > fx_sw->extended_size) - return -1; + goto setfx; /* * Check for the presence of second magic word at the end of memory @@ -45,10 +46,18 @@ static inline int check_for_xstate(struct fxregs_state __user *buf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) - || magic2 != FP_XSTATE_MAGIC2) - return -1; + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) + return -EFAULT; + if (likely(magic2 == FP_XSTATE_MAGIC2)) + return 0; +setfx: + trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); + + /* Set the parameters for fx only state */ + fx_sw->magic1 = 0; + fx_sw->xstate_size = sizeof(struct fxregs_state); + fx_sw->xfeatures = XFEATURE_MASK_FPSSE; return 0; } @@ -213,21 +222,15 @@ retry: static inline void sanitize_restored_user_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, - u64 user_xfeatures, int fx_only) + struct user_i387_ia32_struct *ia32_env, u64 mask) { struct xregs_state *xsave = &state->xsave; struct xstate_header *header = &xsave->header; if (use_xsave()) { /* - * Clear all feature bits which are not set in - * user_xfeatures and clear all extended features - * for fx_only mode. - */ - u64 mask = fx_only ? XFEATURE_MASK_FPSSE : user_xfeatures; - - /* + * Clear all feature bits which are not set in mask. + * * Supervisor state has to be preserved. The sigframe * restore can only modify user features, i.e. @mask * cannot contain them. @@ -286,24 +289,19 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; u64 user_xfeatures = 0; - int fx_only = 0; + bool fx_only = false; int ret = 0; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; - if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { - /* - * Couldn't find the extended state information in the - * memory layout. Restore just the FP/SSE and init all - * the other extended state. - */ - state_size = sizeof(struct fxregs_state); - fx_only = 1; - trace_x86_fpu_xstate_check_failed(fpu); - } else { - state_size = fx_sw_user.xstate_size; - user_xfeatures = fx_sw_user.xfeatures; - } + + ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user); + if (unlikely(ret)) + return ret; + + fx_only = !fx_sw_user.magic1; + state_size = fx_sw_user.xstate_size; + user_xfeatures = fx_sw_user.xfeatures; } if (!ia32_fxstate) { @@ -403,8 +401,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (ret) return ret; - sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, - fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures); fpregs_lock(); if (unlikely(init_bv)) @@ -422,8 +419,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (ret) return -EFAULT; - sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, - fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures); fpregs_lock(); if (use_xsave()) { -- cgit v1.2.3 From cdcec1b77001e7f2cd10dccfc6d9b6d5d3f1f3ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:28 +0200 Subject: x86/fpu/signal: Sanitize copy_user_to_fpregs_zeroing() Now that user_xfeatures is correctly set when xsave is enabled, remove the duplicated initialization of components. Rename the function while at it. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.377341297@linutronix.de --- arch/x86/kernel/fpu/signal.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index d55241038871..a1a70134e1fe 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -251,33 +251,27 @@ sanitize_restored_user_xstate(union fpregs_state *state, } /* - * Restore the extended state if present. Otherwise, restore the FP/SSE state. + * Restore the FPU state directly from the userspace signal frame. */ -static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { - u64 init_bv; - int r; - if (use_xsave()) { - if (fx_only) { - init_bv = xfeatures_mask_uabi() & ~XFEATURE_MASK_FPSSE; + u64 init_bv = xfeatures_mask_uabi() & ~xrestore; + int ret; - r = fxrstor_from_user_sigframe(buf); - if (!r) - os_xrstor(&init_fpstate.xsave, init_bv); - return r; - } else { - init_bv = xfeatures_mask_uabi() & ~xbv; - - r = xrstor_from_user_sigframe(buf, xbv); - if (!r && unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); - return r; - } + if (likely(!fx_only)) + ret = xrstor_from_user_sigframe(buf, xrestore); + else + ret = fxrstor_from_user_sigframe(buf); + + if (!ret && unlikely(init_bv)) + os_xrstor(&init_fpstate.xsave, init_bv); + return ret; } else if (use_fxsr()) { return fxrstor_from_user_sigframe(buf); - } else + } else { return frstor_from_user_sigframe(buf); + } } static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, @@ -314,7 +308,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, */ fpregs_lock(); pagefault_disable(); - ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); + ret = restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); pagefault_enable(); if (!ret) { -- cgit v1.2.3 From 0a6c2e9ec91c96bde1e8ce063180ac6e05e680f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:29 +0200 Subject: x86/fpu/signal: Split out the direct restore code Prepare for smarter failure handling of the direct restore. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.493455414@linutronix.de --- arch/x86/kernel/fpu/signal.c | 112 ++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a1a70134e1fe..aa268d9cf228 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -250,10 +250,8 @@ sanitize_restored_user_xstate(union fpregs_state *state, } } -/* - * Restore the FPU state directly from the userspace signal frame. - */ -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) +static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only) { if (use_xsave()) { u64 init_bv = xfeatures_mask_uabi() & ~xrestore; @@ -274,6 +272,57 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only } } +static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) +{ + struct fpu *fpu = ¤t->thread.fpu; + int ret; + + fpregs_lock(); + pagefault_disable(); + ret = __restore_fpregs_from_user(buf, xrestore, fx_only); + pagefault_enable(); + + if (unlikely(ret)) { + /* + * The above did an FPU restore operation, restricted to + * the user portion of the registers, and failed, but the + * microcode might have modified the FPU registers + * nevertheless. + * + * If the FPU registers do not belong to current, then + * invalidate the FPU register state otherwise the task + * might preempt current and return to user space with + * corrupted FPU registers. + * + * In case current owns the FPU registers then no further + * action is required. The fixup in the slow path will + * handle it correctly. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __cpu_invalidate_fpregs_state(); + fpregs_unlock(); + return ret; + } + + /* + * Restore supervisor states: previous context switch etc has done + * XSAVES and saved the supervisor states in the kernel buffer from + * which they can be restored now. + * + * It would be optimal to handle this with a single XRSTORS, but + * this does not work because the rest of the FPU registers have + * been restored from a user buffer directly. The single XRSTORS + * happens below, when the user buffer has been copied to the + * kernel one. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) + os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); + + fpregs_mark_activate(); + fpregs_unlock(); + return 0; +} + static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { @@ -298,61 +347,16 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, user_xfeatures = fx_sw_user.xfeatures; } - if (!ia32_fxstate) { + if (likely(!ia32_fxstate)) { /* * Attempt to restore the FPU registers directly from user - * memory. For that to succeed, the user access cannot cause - * page faults. If it does, fall back to the slow path below, - * going through the kernel buffer with the enabled pagefault - * handler. + * memory. For that to succeed, the user access cannot cause page + * faults. If it does, fall back to the slow path below, going + * through the kernel buffer with the enabled pagefault handler. */ - fpregs_lock(); - pagefault_disable(); ret = restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); - pagefault_enable(); - if (!ret) { - - /* - * Restore supervisor states: previous context switch - * etc has done XSAVES and saved the supervisor states - * in the kernel buffer from which they can be restored - * now. - * - * We cannot do a single XRSTORS here - which would - * be nice - because the rest of the FPU registers are - * being restored from a user buffer directly. The - * single XRSTORS happens below, when the user buffer - * has been copied to the kernel one. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD) && - xfeatures_mask_supervisor()) { - os_xrstor(&fpu->state.xsave, - xfeatures_mask_supervisor()); - } - fpregs_mark_activate(); - fpregs_unlock(); + if (likely(!ret)) return 0; - } - - /* - * The above did an FPU restore operation, restricted to - * the user portion of the registers, and failed, but the - * microcode might have modified the FPU registers - * nevertheless. - * - * If the FPU registers do not belong to current, then - * invalidate the FPU register state otherwise the task might - * preempt current and return to user space with corrupted - * FPU registers. - * - * In case current owns the FPU registers then no further - * action is required. The fixup below will handle it - * correctly. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - __cpu_invalidate_fpregs_state(); - - fpregs_unlock(); } else { /* * For 32-bit frames with fxstate, copy the fxstate so it can -- cgit v1.2.3 From aee8c67a4faa40a8df4e79316dbfc92d123989c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:30 +0200 Subject: x86/fpu: Return proper error codes from user access functions When *RSTOR from user memory raises an exception, there is no way to differentiate them. That's bad because it forces the slow path even when the failure was not a fault. If the operation raised eg. #GP then going through the slow path is pointless. Use _ASM_EXTABLE_FAULT() which stores the trap number and let the exception fixup return the negated trap number as error. This allows to separate the fast path and let it handle faults directly and avoid the slow path for all other exceptions. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.601480369@linutronix.de --- arch/x86/include/asm/fpu/internal.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 528a86826f95..5a18694a89b2 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -88,6 +88,7 @@ static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif extern void save_fpregs_to_fpstate(struct fpu *fpu); +/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ #define user_insn(insn, output, input...) \ ({ \ int err; \ @@ -95,14 +96,14 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); might_fault(); \ \ asm volatile(ASM_STAC "\n" \ - "1:" #insn "\n\t" \ + "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ - "3: movl $-1,%[err]\n" \ + "3: negl %%eax\n" \ " jmp 2b\n" \ ".previous\n" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err), output \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : [err] "=a" (err), output \ : "0"(0), input); \ err; \ }) @@ -196,16 +197,20 @@ static inline void fxsave(struct fxregs_state *fx) #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" +/* + * After this @err contains 0 on success or the negated trap number when + * the operation raises an exception. For faults this results in -EFAULT. + */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n\t" \ ".pushsection .fixup,\"ax\"\n\t" \ - "3: movl $-2,%[err]\n\t" \ + "3: negl %%eax\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ - _ASM_EXTABLE(1b, 3b) \ - : [err] "=r" (err) \ + _ASM_EXTABLE_FAULT(1b, 3b) \ + : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") -- cgit v1.2.3 From fcb3635f5018e53024c6be3c3213737f469f74ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:31 +0200 Subject: x86/fpu/signal: Handle #PF in the direct restore path If *RSTOR raises an exception, then the slow path is taken. That's wrong because if the reason was not #PF then going through the slow path is waste of time because that will end up with the same conclusion that the data is invalid. Now that the wrapper around *RSTOR return an negative error code, which is the negated trap number, it's possible to differentiate. If the *RSTOR raised #PF then handle it directly in the fast path and if it was some other exception, e.g. #GP, then give up and do not try the fast path. This removes the legacy frame FRSTOR code from the slow path because FRSTOR is not a ia32_fxstate frame and is therefore handled in the fast path. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.696022863@linutronix.de --- arch/x86/kernel/fpu/signal.c | 67 ++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index aa268d9cf228..4c252d0c8e6a 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -272,11 +272,17 @@ static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, } } -static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) +/* + * Attempt to restore the FPU registers directly from user memory. + * Pagefaults are handled and any errors returned are fatal. + */ +static int restore_fpregs_from_user(void __user *buf, u64 xrestore, + bool fx_only, unsigned int size) { struct fpu *fpu = ¤t->thread.fpu; int ret; +retry: fpregs_lock(); pagefault_disable(); ret = __restore_fpregs_from_user(buf, xrestore, fx_only); @@ -293,14 +299,18 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only * invalidate the FPU register state otherwise the task * might preempt current and return to user space with * corrupted FPU registers. - * - * In case current owns the FPU registers then no further - * action is required. The fixup in the slow path will - * handle it correctly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD)) __cpu_invalidate_fpregs_state(); fpregs_unlock(); + + /* Try to handle #PF, but anything else is fatal. */ + if (ret != -EFAULT) + return -EINVAL; + + ret = fault_in_pages_readable(buf, size); + if (!ret) + goto retry; return ret; } @@ -311,9 +321,7 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only * * It would be optimal to handle this with a single XRSTORS, but * this does not work because the rest of the FPU registers have - * been restored from a user buffer directly. The single XRSTORS - * happens below, when the user buffer has been copied to the - * kernel one. + * been restored from a user buffer directly. */ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor()) os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor()); @@ -326,14 +334,13 @@ static int restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { - struct user_i387_ia32_struct *envp = NULL; int state_size = fpu_kernel_xstate_size; struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; u64 user_xfeatures = 0; bool fx_only = false; - int ret = 0; + int ret; if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -354,20 +361,19 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * faults. If it does, fall back to the slow path below, going * through the kernel buffer with the enabled pagefault handler. */ - ret = restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); - if (likely(!ret)) - return 0; - } else { - /* - * For 32-bit frames with fxstate, copy the fxstate so it can - * be reconstructed later. - */ - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - return ret; - envp = &env; + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, + state_size); } + /* + * Copy the legacy state because the FP portion of the FX frame has + * to be ignored for histerical raisins. The legacy state is folded + * in once the larger state has been copied. + */ + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + return ret; + /* * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is * not modified on context switch and that the xstate is considered @@ -382,8 +388,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, * supervisor state is preserved. Save the full state for * simplicity. There is no point in optimizing this by only * saving the supervisor states and then shuffle them to - * the right place in memory. This is the slow path and the - * above XRSTOR failed or ia32_fxstate is true. Shrug. + * the right place in memory. It's ia32 mode. Shrug. */ if (xfeatures_mask_supervisor()) os_xsave(&fpu->state.xsave); @@ -399,7 +404,7 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, if (ret) return ret; - sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures); + sanitize_restored_user_xstate(&fpu->state, &env, user_xfeatures); fpregs_lock(); if (unlikely(init_bv)) @@ -412,12 +417,12 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, ret = os_xrstor_safe(&fpu->state.xsave, user_xfeatures | xfeatures_mask_supervisor()); - } else if (use_fxsr()) { + } else { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); if (ret) return -EFAULT; - sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures); + sanitize_restored_user_xstate(&fpu->state, &env, user_xfeatures); fpregs_lock(); if (use_xsave()) { @@ -428,14 +433,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, } ret = fxrstor_safe(&fpu->state.fxsave); - } else { - ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); - if (ret) - return ret; - - fpregs_lock(); - ret = frstor_safe(&fpu->state.fsave); } + if (!ret) fpregs_mark_activate(); else -- cgit v1.2.3 From 6f9866a166cd1ad3ebb2dcdb3874aa8fee8dea2f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Jun 2021 14:02:32 +0200 Subject: x86/fpu/signal: Let xrstor handle the features to init There is no reason to do an extra XRSTOR from init_fpstate for feature bits which have been cleared by user space in the FX magic xfeatures storage. Just clear them in the task's XSTATE header and do a full restore which will put these cleared features into init state. There is no real difference in performance because the current code already does a full restore when the xfeatures bits are preserved as the signal frame setup has stored them, which is the full UABI feature set. [ bp: Use the negated mxcsr_feature_mask in the MXCSR check. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20210623121457.804115017@linutronix.de --- arch/x86/kernel/fpu/signal.c | 89 +++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 4c252d0c8e6a..445c57c9c539 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -220,36 +220,6 @@ retry: return 0; } -static inline void -sanitize_restored_user_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, u64 mask) -{ - struct xregs_state *xsave = &state->xsave; - struct xstate_header *header = &xsave->header; - - if (use_xsave()) { - /* - * Clear all feature bits which are not set in mask. - * - * Supervisor state has to be preserved. The sigframe - * restore can only modify user features, i.e. @mask - * cannot contain them. - */ - header->xfeatures &= mask | xfeatures_mask_supervisor(); - } - - if (use_fxsr()) { - /* - * mscsr reserved bits must be masked to zero for security - * reasons. - */ - xsave->i387.mxcsr &= mxcsr_feature_mask; - - if (ia32_env) - convert_to_fxsr(&state->fxsave, ia32_env); - } -} - static int __restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { @@ -352,6 +322,8 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, fx_only = !fx_sw_user.magic1; state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; + } else { + user_xfeatures = XFEATURE_MASK_FPSSE; } if (likely(!ia32_fxstate)) { @@ -395,54 +367,55 @@ static int __fpu_restore_sig(void __user *buf, void __user *buf_fx, set_thread_flag(TIF_NEED_FPU_LOAD); } __fpu_invalidate_fpregs_state(fpu); + __cpu_invalidate_fpregs_state(); fpregs_unlock(); if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask_uabi() & ~user_xfeatures; - ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx); if (ret) return ret; + } else { + if (__copy_from_user(&fpu->state.fxsave, buf_fx, + sizeof(fpu->state.fxsave))) + return -EFAULT; - sanitize_restored_user_xstate(&fpu->state, &env, user_xfeatures); + /* Reject invalid MXCSR values. */ + if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask) + return -EINVAL; - fpregs_lock(); - if (unlikely(init_bv)) - os_xrstor(&init_fpstate.xsave, init_bv); + /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */ + if (use_xsave()) + fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE; + } + + /* Fold the legacy FP storage */ + convert_to_fxsr(&fpu->state.fxsave, &env); + fpregs_lock(); + if (use_xsave()) { /* - * Restore previously saved supervisor xstates along with - * copied-in user xstates. + * Remove all UABI feature bits not set in user_xfeatures + * from the memory xstate header which makes the full + * restore below bring them into init state. This works for + * fx_only mode as well because that has only FP and SSE + * set in user_xfeatures. + * + * Preserve supervisor states! */ - ret = os_xrstor_safe(&fpu->state.xsave, - user_xfeatures | xfeatures_mask_supervisor()); + u64 mask = user_xfeatures | xfeatures_mask_supervisor(); + fpu->state.xsave.header.xfeatures &= mask; + ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all); } else { - ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); - if (ret) - return -EFAULT; - - sanitize_restored_user_xstate(&fpu->state, &env, user_xfeatures); - - fpregs_lock(); - if (use_xsave()) { - u64 init_bv; - - init_bv = xfeatures_mask_uabi() & ~XFEATURE_MASK_FPSSE; - os_xrstor(&init_fpstate.xsave, init_bv); - } - ret = fxrstor_safe(&fpu->state.fxsave); } - if (!ret) + if (likely(!ret)) fpregs_mark_activate(); - else - fpregs_deactivate(fpu); + fpregs_unlock(); return ret; } - static inline int xstate_sigframe_size(void) { return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : -- cgit v1.2.3 From 7f049fbdd57f6ea71dc741d903c19c73b2f70950 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Jun 2021 15:03:16 +0200 Subject: perf/x86/intel/lbr: Zero the xstate buffer on allocation XRSTORS requires a valid xstate buffer to work correctly. XSAVES does not guarantee to write a fully valid buffer according to the SDM: "XSAVES does not write to any parts of the XSAVE header other than the XSTATE_BV and XCOMP_BV fields." XRSTORS triggers a #GP: "If bytes 63:16 of the XSAVE header are not all zero." It's dubious at best how this can work at all when the buffer is not zeroed before use. Allocate the buffers with __GFP_ZERO to prevent XRSTORS failure. Fixes: ce711ea3cab9 ("perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR context switch") Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/87wnr0wo2z.ffs@nanos.tec.linutronix.de --- arch/x86/events/intel/lbr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4409d2cccfda..e8453de7a964 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -731,7 +731,8 @@ void reserve_lbr_buffers(void) if (!kmem_cache || cpuc->lbr_xsave) continue; - cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, GFP_KERNEL, + cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, + GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); } } -- cgit v1.2.3 From f9b871c89ae61d5a4c0b81659fa6819c50d4ced2 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 16 Jun 2021 20:15:30 +0200 Subject: x86/resctrl: Fix kernel-doc in pseudo_lock.c Add undocumented parameters detected by scripts/kernel-doc. Signed-off-by: Fabio M. De Francesco Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20210616181530.4094-1-fmdefrancesco@gmail.com --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 05a89e33fde2..2207916cae65 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -49,6 +49,7 @@ static struct class *pseudo_lock_class; /** * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * @void: It takes no parameters. * * Capture the list of platforms that have been validated to support * pseudo-locking. This includes testing to ensure pseudo-locked regions @@ -162,7 +163,7 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor) } /** - * pseudo_lock_pm_req - A power management QoS request list entry + * struct pseudo_lock_pm_req - A power management QoS request list entry * @list: Entry within the @pm_reqs list for a pseudo-locked region * @req: PM QoS request */ @@ -184,6 +185,7 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) /** * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * @plr: Pseudo-locked region * * To prevent the cache from being affected by power management entering * C6 has to be avoided. This is accomplished by requesting a latency @@ -196,6 +198,8 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) * the ACPI latencies need to be considered while keeping in mind that C2 * may be set to map to deeper sleep states. In this case the latency * requirement needs to prevent entering C2 also. + * + * Return: 0 on success, <0 on failure */ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) { @@ -520,7 +524,7 @@ static int pseudo_lock_fn(void *_rdtgrp) /** * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @r: resource group being queried + * @rdtgrp: resource group being queried * * Return: 1 if monitor groups have been created for this resource * group, 0 otherwise. @@ -1140,6 +1144,8 @@ out: /** * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * @rdtgrp: Resource group to which the pseudo-locked region belongs. + * @sel: Selector of which measurement to perform on a pseudo-locked region. * * The measurement of latency to access a pseudo-locked region should be * done from a cpu that is associated with that pseudo-locked region. -- cgit v1.2.3 From fd2afa70eff057fab57c9e06708b68677b261a0c Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sat, 19 Jun 2021 00:32:06 +0200 Subject: x86/resctrl: Fix kernel-doc in internal.h Add description of undocumented parameters. Issues detected by scripts/kernel-doc. Signed-off-by: Fabio M. De Francesco Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/20210618223206.29539-1-fmdefrancesco@gmail.com --- arch/x86/kernel/cpu/resctrl/internal.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c4d320d02fd5..6a5f60a37219 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -70,6 +70,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * struct mon_evt - Entry in the event list of a resource * @evtid: event id * @name: name of the event + * @list: entry in &rdt_resource->evt_list */ struct mon_evt { u32 evtid; @@ -78,10 +79,13 @@ struct mon_evt { }; /** - * struct mon_data_bits - Monitoring details for each event file - * @rid: Resource id associated with the event file. + * union mon_data_bits - Monitoring details for each event file + * @priv: Used to store monitoring event data in @u + * as kernfs private data + * @rid: Resource id associated with the event file * @evtid: Event id associated with the event file * @domid: The domain to which the event file belongs + * @u: Name of the bit fields struct */ union mon_data_bits { void *priv; @@ -119,6 +123,7 @@ enum rdt_group_type { * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes * * The mode of a resource group enables control over the allowed overlap * between allocations associated with different resource groups (classes @@ -142,7 +147,7 @@ enum rdtgrp_mode { /** * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn kernlfs node for the mon_data directory + * @mon_data_kn: kernfs node for the mon_data directory * @parent: parent rdtgrp * @crdtgrp_list: child rdtgroup node list * @rmid: rmid for this rdtgroup @@ -282,11 +287,11 @@ struct rftype { /** * struct mbm_state - status for each MBM counter in each domain * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) - * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting - * @prev_bw The most recent bandwidth in MBps - * @delta_bw Difference between the current and previous bandwidth - * @delta_comp Indicates whether to compute the delta_bw + * @prev_bw: The most recent bandwidth in MBps + * @delta_bw: Difference between the current and previous bandwidth + * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { u64 chunks; @@ -456,11 +461,13 @@ struct rdt_parse_data { * @data_width: Character width of data when displaying * @domains: All domains for this resource * @cache: Cache allocation related data + * @membw: If the component has bandwidth controls, their properties. * @format_str: Per resource format string to show domain value * @parse_ctrlval: Per resource function pointer to parse control values * @evt_list: List of monitoring events * @num_rmid: Number of RMIDs available * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @mbm_width: Monitor width, to detect and correct for overflow. * @fflags: flags to choose base and info files */ struct rdt_resource { -- cgit v1.2.3 From 18f63b15b0283d6f37be3174e2c7b6f2d6ed91cf Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 21 Jun 2021 15:16:48 -0700 Subject: KVM: x86: Print CPU of last attempted VM-entry when dumping VMCS/VMCB Failed VM-entry is often due to a faulty core. To help identify bad cores, print the id of the last logical processor that attempted VM-entry whenever dumping a VMCS or VMCB. Signed-off-by: Jim Mattson Message-Id: <20210621221648.1833148-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4cee285b0185..8834822c00cd 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3132,6 +3132,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) return; } + pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", + svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); pr_err("VMCB Control Area:\n"); pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab6f682645d7..684daa3eefc2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5724,6 +5724,8 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (cpu_has_secondary_exec_ctrls()) secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", + vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); pr_err("*** Guest State ***\n"); pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), -- cgit v1.2.3 From 31c656570065727028f96c811b5ea9fc61502a18 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 22 Jun 2021 16:09:12 +0100 Subject: KVM: x86/mmu: Fix uninitialized boolean variable flush In the case where kvm_memslots_have_rmaps(kvm) is false the boolean variable flush is not set and is uninitialized. If is_tdp_mmu_enabled(kvm) is true then the call to kvm_tdp_mmu_zap_collapsible_sptes passes the uninitialized value of flush into the call. Fix this by initializing flush to false. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: e2209710ccc5 ("KVM: x86/mmu: Skip rmap operations if rmaps not allocated") Signed-off-by: Colin Ian King Reviewed-by: Sean Christopherson Message-Id: <20210622150912.23429-1-colin.king@canonical.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 84d48a33e38b..b3be690d081a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5689,7 +5689,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, { /* FIXME: const-ify all uses of struct kvm_memory_slot. */ struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; - bool flush; + bool flush = false; if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); -- cgit v1.2.3 From b33bb78a1fada6445c265c585ee0dd0fc6279102 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:22:44 -0700 Subject: KVM: nVMX: Handle split-lock #AC exceptions that happen in L2 Mark #ACs that won't be reinjected to the guest as wanted by L0 so that KVM handles split-lock #AC from L2 instead of forwarding the exception to L1. Split-lock #AC isn't yet virtualized, i.e. L1 will treat it like a regular #AC and do the wrong thing, e.g. reinject it into L2. Fixes: e6f8b6c12f03 ("KVM: VMX: Extend VMXs #AC interceptor to handle split lock #AC in guest") Cc: Xiaoyao Li Signed-off-by: Sean Christopherson Message-Id: <20210622172244.3561540-1-seanjc@google.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 +++ arch/x86/kvm/vmx/vmcs.h | 5 +++++ arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/vmx/vmx.h | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 183fd9d62fc5..fa3f50f0a3fa 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5833,6 +5833,9 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, else if (is_breakpoint(intr_info) && vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return true; + else if (is_alignment_check(intr_info) && + !vmx_guest_inject_ac(vcpu)) + return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index de3b04d4b587..4b9957e2bf5b 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -117,6 +117,11 @@ static inline bool is_gp_fault(u32 intr_info) return is_exception_n(intr_info, GP_VECTOR); } +static inline bool is_alignment_check(u32 intr_info) +{ + return is_exception_n(intr_info, AC_VECTOR); +} + static inline bool is_machine_check(u32 intr_info) { return is_exception_n(intr_info, MC_VECTOR); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 684daa3eefc2..5a1067c42f3a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4743,7 +4743,7 @@ static int handle_machine_check(struct kvm_vcpu *vcpu) * - Guest has #AC detection enabled in CR0 * - Guest EFLAGS has AC bit set */ -static inline bool guest_inject_ac(struct kvm_vcpu *vcpu) +bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) { if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) return true; @@ -4851,7 +4851,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_run->debug.arch.exception = ex_no; break; case AC_VECTOR: - if (guest_inject_ac(vcpu)) { + if (vmx_guest_inject_ac(vcpu)) { kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); return 1; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 5740f8e2aa23..3979a947933a 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -376,6 +376,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From ced50fc49f3bde2892c3d7fad7b3b6bfbc6ef90e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 23 Jun 2021 13:25:04 +0200 Subject: bpf, x86: Remove unused cnt increase from EMIT macro Removing unused cnt increase from EMIT macro together with cnt declarations. This was introduced in commit [1] to ensure proper code generation. But that code was removed in commit [2] and this extra code was left in. [1] b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") [2] ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210623112504.709856-1-jolsa@kernel.org --- arch/x86/net/bpf_jit_comp.c | 44 ++++++++++++-------------------------------- 1 file changed, 12 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 2a2e290fa5d8..db1e83813db5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -31,7 +31,7 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) } #define EMIT(bytes, len) \ - do { prog = emit_code(prog, bytes, len); cnt += len; } while (0) + do { prog = emit_code(prog, bytes, len); } while (0) #define EMIT1(b1) EMIT(b1, 1) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) @@ -239,7 +239,6 @@ struct jit_context { static void push_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; - int cnt = 0; if (callee_regs_used[0]) EMIT1(0x53); /* push rbx */ @@ -255,7 +254,6 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used) static void pop_callee_regs(u8 **pprog, bool *callee_regs_used) { u8 *prog = *pprog; - int cnt = 0; if (callee_regs_used[3]) EMIT2(0x41, 0x5F); /* pop r15 */ @@ -277,13 +275,12 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog) { u8 *prog = *pprog; - int cnt = X86_PATCH_SIZE; /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ - memcpy(prog, x86_nops[5], cnt); - prog += cnt; + memcpy(prog, x86_nops[5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; if (!ebpf_from_cbpf) { if (tail_call_reachable && !is_subprog) EMIT2(0x31, 0xC0); /* xor eax, eax */ @@ -303,7 +300,6 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) { u8 *prog = *pprog; - int cnt = 0; s64 offset; offset = func - (ip + X86_PATCH_SIZE); @@ -423,7 +419,6 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, int off1 = 42; int off2 = 31; int off3 = 9; - int cnt = 0; /* count the additional bytes used for popping callee regs from stack * that need to be taken into account for each of the offsets that @@ -513,7 +508,6 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, int pop_bytes = 0; int off1 = 20; int poke_off; - int cnt = 0; /* count the additional bytes used for popping callee regs to stack * that need to be taken into account for jump offset that is used for @@ -615,7 +609,6 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate, { u8 *prog = *pprog; u8 b1, b2, b3; - int cnt = 0; /* * Optimization: if imm32 is positive, use 'mov %eax, imm32' @@ -655,7 +648,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo) { u8 *prog = *pprog; - int cnt = 0; if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) { /* @@ -678,7 +670,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg, static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) { u8 *prog = *pprog; - int cnt = 0; if (is64) { /* mov dst, src */ @@ -697,7 +688,6 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) { u8 *prog = *pprog; - int cnt = 0; if (is_imm8(off)) { /* 1-byte signed displacement. @@ -720,7 +710,6 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off) static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) { u8 *prog = *pprog; - int cnt = 0; if (is64) EMIT1(add_2mod(0x48, dst_reg, src_reg)); @@ -733,7 +722,6 @@ static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64) static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { u8 *prog = *pprog; - int cnt = 0; switch (size) { case BPF_B: @@ -764,7 +752,6 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { u8 *prog = *pprog; - int cnt = 0; switch (size) { case BPF_B: @@ -799,7 +786,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { u8 *prog = *pprog; - int cnt = 0; EMIT1(0xF0); /* lock prefix */ @@ -869,10 +855,10 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt, } } -static int emit_nops(u8 **pprog, int len) +static void emit_nops(u8 **pprog, int len) { u8 *prog = *pprog; - int i, noplen, cnt = 0; + int i, noplen; while (len > 0) { noplen = len; @@ -886,8 +872,6 @@ static int emit_nops(u8 **pprog, int len) } *pprog = prog; - - return cnt; } #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) @@ -902,7 +886,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, cnt = 0, excnt = 0; + int i, excnt = 0; int ilen, proglen = 0; u8 *prog = temp; int err; @@ -1576,7 +1560,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ nops); return -EFAULT; } - cnt += emit_nops(&prog, nops); + emit_nops(&prog, nops); } EMIT2(jmp_cond, jmp_offset); } else if (is_simm32(jmp_offset)) { @@ -1622,7 +1606,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */ nops); return -EFAULT; } - cnt += emit_nops(&prog, nops); + emit_nops(&prog, nops); } break; } @@ -1647,7 +1631,7 @@ emit_jmp: nops); return -EFAULT; } - cnt += emit_nops(&prog, INSN_SZ_DIFF - 2); + emit_nops(&prog, INSN_SZ_DIFF - 2); } EMIT2(0xEB, jmp_offset); } else if (is_simm32(jmp_offset)) { @@ -1754,7 +1738,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, { u8 *prog = *pprog; u8 *jmp_insn; - int cnt = 0; /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); @@ -1822,7 +1805,6 @@ static void emit_align(u8 **pprog, u32 align) static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) { u8 *prog = *pprog; - int cnt = 0; s64 offset; offset = func - (ip + 2 + 4); @@ -1854,7 +1836,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, u8 **branches) { u8 *prog = *pprog; - int i, cnt = 0; + int i; /* The first fmod_ret program will receive a garbage return value. * Set this to 0 to avoid confusing the program. @@ -1950,7 +1932,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *tprogs, void *orig_call) { - int ret, i, cnt = 0, nr_args = m->nr_args; + int ret, i, nr_args = m->nr_args; int stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; @@ -2095,8 +2077,6 @@ static int emit_fallback_jump(u8 **pprog) */ err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog); #else - int cnt = 0; - EMIT2(0xFF, 0xE2); /* jmp rdx */ #endif *pprog = prog; @@ -2106,7 +2086,7 @@ static int emit_fallback_jump(u8 **pprog) static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs) { u8 *jg_reloc, *prog = *pprog; - int pivot, err, jg_bytes = 1, cnt = 0; + int pivot, err, jg_bytes = 1; s64 jg_offset; if (a == b) { -- cgit v1.2.3 From 93c2cdc975aab53c222472c5b96c2d41dbeb350c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Jun 2021 17:09:18 +0200 Subject: x86/fpu/xstate: Clear xstate header in copy_xstate_to_uabi_buf() again The change which made copy_xstate_to_uabi_buf() usable for [x]fpregs_get() removed the zeroing of the header which means the header, which is copied to user space later, contains except for the xfeatures member, random stack content. Add the memset() back to zero it before usage. Fixes: eb6f51723f03 ("x86/fpu: Make copy_xstate_to_kernel() usable for [x]fpregs_get()") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/875yy3wb8h.ffs@nanos.tec.linutronix.de --- arch/x86/kernel/fpu/xstate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 21a10a66c4e4..c8def1b7f8fb 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -982,6 +982,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, unsigned int zerofrom; int i; + memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; /* Mask out the feature bits depending on copy mode */ -- cgit v1.2.3 From 6c6e166b2c8513721d166c74060d26d3f4aecb48 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 00:24:54 -0700 Subject: KVM: x86/mmu: Don't WARN on a NULL shadow page in TDP MMU check Treat a NULL shadow page in the "is a TDP MMU" check as valid, non-TDP root. KVM uses a "direct" PAE paging MMU when TDP is disabled and the guest is running with paging disabled. In that case, root_hpa points at the pae_root page (of which only 32 bytes are used), not a standard shadow page, and the WARN fires (a lot). Fixes: 0b873fd7fb53 ("KVM: x86/mmu: Remove redundant is_tdp_mmu_enabled check") Cc: David Matlack Signed-off-by: Sean Christopherson Message-Id: <20210622072454.3449146-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index b981a044ab55..1cae4485b3bc 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -94,11 +94,13 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) if (WARN_ON(!VALID_PAGE(hpa))) return false; + /* + * A NULL shadow page is legal when shadowing a non-paging guest with + * PAE paging, as the MMU will be direct with root_hpa pointing at the + * pae_root page, not a shadow page. + */ sp = to_shadow_page(hpa); - if (WARN_ON(!sp)) - return false; - - return is_tdp_mmu_page(sp) && sp->root_count; + return sp && is_tdp_mmu_page(sp) && sp->root_count; } #else static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } -- cgit v1.2.3 From 0193cc908b5ae8aff2e2d2997ca5d4ae26ed24d4 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 18 Jun 2021 22:27:03 +0000 Subject: KVM: stats: Separate generic stats from architecture specific ones Generic KVM stats are those collected in architecture independent code or those supported by all architectures; put all generic statistics in a separate structure. This ensures that they are defined the same way in the statistics API which is being added, removing duplication among different architectures in the declaration of the descriptors. No functional change intended. Reviewed-by: David Matlack Reviewed-by: Ricardo Koller Reviewed-by: Krish Sadhukhan Signed-off-by: Jing Zhang Message-Id: <20210618222709.1858088-2-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 9 ++------- arch/arm64/kvm/guest.c | 12 ++++++------ arch/mips/include/asm/kvm_host.h | 9 ++------- arch/mips/kvm/mips.c | 12 ++++++------ arch/powerpc/include/asm/kvm_host.h | 9 ++------- arch/powerpc/kvm/book3s.c | 12 ++++++------ arch/powerpc/kvm/book3s_hv.c | 12 ++++++------ arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/book3s_pr_papr.c | 2 +- arch/powerpc/kvm/booke.c | 14 +++++++------- arch/s390/include/asm/kvm_host.h | 9 ++------- arch/s390/kvm/kvm-s390.c | 12 ++++++------ arch/x86/include/asm/kvm_host.h | 9 ++------- arch/x86/kvm/x86.c | 14 +++++++------- include/linux/kvm_types.h | 12 ++++++++++++ virt/kvm/kvm_main.c | 14 +++++++------- 16 files changed, 75 insertions(+), 88 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d56f365b38a8..5a2c82f63baa 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -556,16 +556,11 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) } struct kvm_vm_stat { - u64 remote_tlb_flush; + struct kvm_vm_stat_generic generic; }; struct kvm_vcpu_stat { - u64 halt_successful_poll; - u64 halt_attempted_poll; - u64 halt_poll_success_ns; - u64 halt_poll_fail_ns; - u64 halt_poll_invalid; - u64 halt_wakeup; + struct kvm_vcpu_stat_generic generic; u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 5cb4a1cd5603..988ead309cbe 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -29,18 +29,18 @@ #include "trace.h" struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), + VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), + VCPU_STAT("halt_wakeup", generic.halt_wakeup), VCPU_STAT("hvc_exit_stat", hvc_exit_stat), VCPU_STAT("wfe_exit_stat", wfe_exit_stat), VCPU_STAT("wfi_exit_stat", wfi_exit_stat), VCPU_STAT("mmio_exit_user", mmio_exit_user), VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel), VCPU_STAT("exits", exits), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), { NULL } }; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 4245c082095f..696f6b009377 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -109,10 +109,11 @@ static inline bool kvm_is_error_hva(unsigned long addr) } struct kvm_vm_stat { - u64 remote_tlb_flush; + struct kvm_vm_stat_generic generic; }; struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; u64 wait_exits; u64 cache_exits; u64 signal_exits; @@ -142,12 +143,6 @@ struct kvm_vcpu_stat { #ifdef CONFIG_CPU_LOONGSON64 u64 vz_cpucfg_exits; #endif - u64 halt_successful_poll; - u64 halt_attempted_poll; - u64 halt_poll_success_ns; - u64 halt_poll_fail_ns; - u64 halt_poll_invalid; - u64 halt_wakeup; }; struct kvm_arch_memory_slot { diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 4d4af97dcc88..2f2969aef60c 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -68,12 +68,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { #ifdef CONFIG_CPU_LOONGSON64 VCPU_STAT("vz_cpucfg", vz_cpucfg_exits), #endif - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), + VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), + VCPU_STAT("halt_wakeup", generic.halt_wakeup), + VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), {NULL} }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index dd8bd4706259..9f52f282b1aa 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -81,12 +81,13 @@ struct kvmppc_book3s_shadow_vcpu; struct kvm_nested_guest; struct kvm_vm_stat { - u64 remote_tlb_flush; + struct kvm_vm_stat_generic generic; u64 num_2M_pages; u64 num_1G_pages; }; struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; u64 sum_exits; u64 mmio_exits; u64 signal_exits; @@ -102,14 +103,8 @@ struct kvm_vcpu_stat { u64 emulated_inst_exits; u64 dec_exits; u64 ext_intr_exits; - u64 halt_poll_success_ns; - u64 halt_poll_fail_ns; u64 halt_wait_ns; - u64 halt_successful_poll; - u64 halt_attempted_poll; u64 halt_successful_wait; - u64 halt_poll_invalid; - u64 halt_wakeup; u64 dbell_exits; u64 gdbell_exits; u64 ld; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 5e1e1cff0ee3..ae9f1b855ff9 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -47,14 +47,14 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("dec", dec_exits), VCPU_STAT("ext_intr", ext_intr_exits), VCPU_STAT("queue_intr", queue_intr), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), VCPU_STAT("halt_wait_ns", halt_wait_ns), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), + VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), VCPU_STAT("halt_successful_wait", halt_successful_wait), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), + VCPU_STAT("halt_wakeup", generic.halt_wakeup), VCPU_STAT("pf_storage", pf_storage), VCPU_STAT("sp_storage", sp_storage), VCPU_STAT("pf_instruc", pf_instruc), diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7e73e5bfe4ba..cd544a46183e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -230,7 +230,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) waitp = kvm_arch_vcpu_get_wait(vcpu); if (rcuwait_wake_up(waitp)) - ++vcpu->stat.halt_wakeup; + ++vcpu->stat.generic.halt_wakeup; cpu = READ_ONCE(vcpu->arch.thread_cpu); if (cpu >= 0 && kvmppc_ipi_thread(cpu)) @@ -4092,7 +4092,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) cur = start_poll = ktime_get(); if (vc->halt_poll_ns) { ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns); - ++vc->runner->stat.halt_attempted_poll; + ++vc->runner->stat.generic.halt_attempted_poll; vc->vcore_state = VCORE_POLLING; spin_unlock(&vc->lock); @@ -4109,7 +4109,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_INACTIVE; if (!do_sleep) { - ++vc->runner->stat.halt_successful_poll; + ++vc->runner->stat.generic.halt_successful_poll; goto out; } } @@ -4121,7 +4121,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) do_sleep = 0; /* If we polled, count this as a successful poll */ if (vc->halt_poll_ns) - ++vc->runner->stat.halt_successful_poll; + ++vc->runner->stat.generic.halt_successful_poll; goto out; } @@ -4148,13 +4148,13 @@ out: ktime_to_ns(cur) - ktime_to_ns(start_wait); /* Attribute failed poll time */ if (vc->halt_poll_ns) - vc->runner->stat.halt_poll_fail_ns += + vc->runner->stat.generic.halt_poll_fail_ns += ktime_to_ns(start_wait) - ktime_to_ns(start_poll); } else { /* Attribute successful poll time */ if (vc->halt_poll_ns) - vc->runner->stat.halt_poll_success_ns += + vc->runner->stat.generic.halt_poll_success_ns += ktime_to_ns(cur) - ktime_to_ns(start_poll); } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d7733b07f489..71bcb0140461 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -493,7 +493,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) if (!vcpu->arch.pending_exceptions) { kvm_vcpu_block(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); - vcpu->stat.halt_wakeup++; + vcpu->stat.generic.halt_wakeup++; /* Unset POW bit after we woke up */ msr &= ~MSR_POW; diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index 031c8015864a..ac14239f3424 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -378,7 +378,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); - vcpu->stat.halt_wakeup++; + vcpu->stat.generic.halt_wakeup++; return EMULATE_DONE; case H_LOGICAL_CI_LOAD: return kvmppc_h_pr_logical_ci_load(vcpu); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 7d5fe43f85c4..7a75559ab51d 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -49,15 +49,15 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("inst_emu", emulated_inst_exits), VCPU_STAT("dec", dec_exits), VCPU_STAT("ext_intr", ext_intr_exits), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), + VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), + VCPU_STAT("halt_wakeup", generic.halt_wakeup), VCPU_STAT("doorbell", dbell_exits), VCPU_STAT("guest doorbell", gdbell_exits), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), - VM_STAT("remote_tlb_flush", remote_tlb_flush), + VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), + VM_STAT("remote_tlb_flush", generic.remote_tlb_flush), { NULL } }; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8925f3969478..9b4473f76e56 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -361,6 +361,7 @@ struct sie_page { }; struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; u64 exit_userspace; u64 exit_null; u64 exit_external_request; @@ -370,13 +371,7 @@ struct kvm_vcpu_stat { u64 exit_validity; u64 exit_instruction; u64 exit_pei; - u64 halt_successful_poll; - u64 halt_attempted_poll; - u64 halt_poll_invalid; u64 halt_no_poll_steal; - u64 halt_wakeup; - u64 halt_poll_success_ns; - u64 halt_poll_fail_ns; u64 instruction_lctl; u64 instruction_lctlg; u64 instruction_stctl; @@ -755,12 +750,12 @@ struct kvm_vcpu_arch { }; struct kvm_vm_stat { + struct kvm_vm_stat_generic generic; u64 inject_io; u64 inject_float_mchk; u64 inject_pfault_done; u64 inject_service_signal; u64 inject_virtio; - u64 remote_tlb_flush; }; struct kvm_arch_memory_slot { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1296fc10f80c..75ad44c44717 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -72,13 +72,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("exit_program_interruption", exit_program_interruption), VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program), VCPU_STAT("exit_operation_exception", exit_operation_exception), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), + VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal), - VCPU_STAT("halt_wakeup", halt_wakeup), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("halt_wakeup", generic.halt_wakeup), + VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), VCPU_STAT("instruction_lctlg", instruction_lctlg), VCPU_STAT("instruction_lctl", instruction_lctl), VCPU_STAT("instruction_stctl", instruction_stctl), diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e11d64aa0bcd..408051552121 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1160,6 +1160,7 @@ struct kvm_arch { }; struct kvm_vm_stat { + struct kvm_vm_stat_generic generic; u64 mmu_shadow_zapped; u64 mmu_pte_write; u64 mmu_pde_zapped; @@ -1167,13 +1168,13 @@ struct kvm_vm_stat { u64 mmu_recycled; u64 mmu_cache_miss; u64 mmu_unsync; - u64 remote_tlb_flush; u64 lpages; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { + struct kvm_vcpu_stat_generic generic; u64 pf_fixed; u64 pf_guest; u64 tlb_flush; @@ -1187,10 +1188,6 @@ struct kvm_vcpu_stat { u64 nmi_window_exits; u64 l1d_flush; u64 halt_exits; - u64 halt_successful_poll; - u64 halt_attempted_poll; - u64 halt_poll_invalid; - u64 halt_wakeup; u64 request_irq_exits; u64 irq_exits; u64 host_state_reload; @@ -1201,8 +1198,6 @@ struct kvm_vcpu_stat { u64 irq_injections; u64 nmi_injections; u64 req_event; - u64 halt_poll_success_ns; - u64 halt_poll_fail_ns; u64 nested_run; u64 directed_yield_attempted; u64 directed_yield_successful; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 38c003b60339..71202330848a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -235,10 +235,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("irq_window", irq_window_exits), VCPU_STAT("nmi_window", nmi_window_exits), VCPU_STAT("halt_exits", halt_exits), - VCPU_STAT("halt_successful_poll", halt_successful_poll), - VCPU_STAT("halt_attempted_poll", halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", halt_poll_invalid), - VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), + VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), + VCPU_STAT("halt_wakeup", generic.halt_wakeup), VCPU_STAT("hypercalls", hypercalls), VCPU_STAT("request_irq", request_irq_exits), VCPU_STAT("irq_exits", irq_exits), @@ -250,8 +250,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("nmi_injections", nmi_injections), VCPU_STAT("req_event", req_event), VCPU_STAT("l1d_flush", l1d_flush), - VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), VCPU_STAT("nested_run", nested_run), VCPU_STAT("directed_yield_attempted", directed_yield_attempted), VCPU_STAT("directed_yield_successful", directed_yield_successful), @@ -263,7 +263,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VM_STAT("mmu_recycled", mmu_recycled), VM_STAT("mmu_cache_miss", mmu_cache_miss), VM_STAT("mmu_unsync", mmu_unsync), - VM_STAT("remote_tlb_flush", remote_tlb_flush), + VM_STAT("remote_tlb_flush", generic.remote_tlb_flush), VM_STAT("largepages", lpages, .mode = 0444), VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index a7580f69dda0..48db778291b7 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -76,5 +76,17 @@ struct kvm_mmu_memory_cache { }; #endif +struct kvm_vm_stat_generic { + u64 remote_tlb_flush; +}; + +struct kvm_vcpu_stat_generic { + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; +}; #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ed4d1581d502..cec986487b30 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -332,7 +332,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) */ if (!kvm_arch_flush_remote_tlb(kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) - ++kvm->stat.remote_tlb_flush; + ++kvm->stat.generic.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); } EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); @@ -3029,9 +3029,9 @@ static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited) { if (waited) - vcpu->stat.halt_poll_fail_ns += poll_ns; + vcpu->stat.generic.halt_poll_fail_ns += poll_ns; else - vcpu->stat.halt_poll_success_ns += poll_ns; + vcpu->stat.generic.halt_poll_success_ns += poll_ns; } /* @@ -3049,16 +3049,16 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); - ++vcpu->stat.halt_attempted_poll; + ++vcpu->stat.generic.halt_attempted_poll; do { /* * This sets KVM_REQ_UNHALT if an interrupt * arrives. */ if (kvm_vcpu_check_block(vcpu) < 0) { - ++vcpu->stat.halt_successful_poll; + ++vcpu->stat.generic.halt_successful_poll; if (!vcpu_valid_wakeup(vcpu)) - ++vcpu->stat.halt_poll_invalid; + ++vcpu->stat.generic.halt_poll_invalid; goto out; } poll_end = cur = ktime_get(); @@ -3115,7 +3115,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) waitp = kvm_arch_vcpu_get_wait(vcpu); if (rcuwait_wake_up(waitp)) { WRITE_ONCE(vcpu->ready, true); - ++vcpu->stat.halt_wakeup; + ++vcpu->stat.generic.halt_wakeup; return true; } -- cgit v1.2.3 From cb082bfab59a224a49ae803fed52cd03e8d6b5e0 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 18 Jun 2021 22:27:04 +0000 Subject: KVM: stats: Add fd-based API to read binary stats data This commit defines the API for userspace and prepare the common functionalities to support per VM/VCPU binary stats data readings. The KVM stats now is only accessible by debugfs, which has some shortcomings this change series are supposed to fix: 1. The current debugfs stats solution in KVM could be disabled when kernel Lockdown mode is enabled, which is a potential rick for production. 2. The current debugfs stats solution in KVM is organized as "one stats per file", it is good for debugging, but not efficient for production. 3. The stats read/clear in current debugfs solution in KVM are protected by the global kvm_lock. Besides that, there are some other benefits with this change: 1. All KVM VM/VCPU stats can be read out in a bulk by one copy to userspace. 2. A schema is used to describe KVM statistics. From userspace's perspective, the KVM statistics are self-describing. 3. With the fd-based solution, a separate telemetry would be able to read KVM stats in a less privileged environment. 4. After the initial setup by reading in stats descriptors, a telemetry only needs to read the stats data itself, no more parsing or setup is needed. Reviewed-by: David Matlack Reviewed-by: Ricardo Koller Reviewed-by: Krish Sadhukhan Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba #arm64 Signed-off-by: Jing Zhang Message-Id: <20210618222709.1858088-3-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/Makefile | 2 +- arch/mips/kvm/Makefile | 2 +- arch/powerpc/kvm/Makefile | 2 +- arch/s390/kvm/Makefile | 3 +- arch/x86/kvm/Makefile | 2 +- include/linux/kvm_host.h | 82 +++++++++++++++++++++++++- include/linux/kvm_types.h | 2 + include/uapi/linux/kvm.h | 73 +++++++++++++++++++++++ virt/kvm/binary_stats.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 307 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/binary_stats.c (limited to 'arch/x86') diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 589921392cb1..989bb5dad2c8 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM) += hyp/ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ - $(KVM)/vfio.o $(KVM)/irqchip.o \ + $(KVM)/vfio.o $(KVM)/irqchip.o $(KVM)/binary_stats.o \ arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ guest.o debug.o reset.o sys_regs.o \ diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 30cc060857c7..c67250a956b8 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -2,7 +2,7 @@ # Makefile for KVM support for MIPS # -common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o) +common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o eventfd.o binary_stats.o) EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index ab241317481c..583c14ef596e 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -6,7 +6,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm -common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o +common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 12decca22e7c..b3aaadc60ead 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -4,7 +4,8 @@ # Copyright IBM Corp. 2008 KVM := ../../../virt/kvm -common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqchip.o $(KVM)/vfio.o +common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o \ + $(KVM)/irqchip.o $(KVM)/vfio.o $(KVM)/binary_stats.o ccflags-y := -Ivirt/kvm -Iarch/s390/kvm diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 83331376b779..75dfd27b6e8a 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -11,7 +11,7 @@ KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \ - $(KVM)/dirty_ring.o + $(KVM)/dirty_ring.o $(KVM)/binary_stats.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 37cbb56ccd09..9ee7f350473b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1272,16 +1272,94 @@ struct kvm_stats_debugfs_item { int mode; }; +struct _kvm_stats_desc { + struct kvm_stats_desc desc; + char name[KVM_STATS_NAME_SIZE]; +}; + #define KVM_DBGFS_GET_MODE(dbgfs_item) \ ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644) -#define VM_STAT(n, x, ...) \ +#define VM_STAT(n, x, ...) \ { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ } -#define VCPU_STAT(n, x, ...) \ +#define VCPU_STAT(n, x, ...) \ { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ } +#define STATS_DESC_COMMON(type, unit, base, exp) \ + .flags = type | unit | base | \ + BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ + BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ + BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ + .exponent = exp, \ + .size = 1 + +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ + { \ + { \ + STATS_DESC_COMMON(type, unit, base, exp), \ + .offset = offsetof(struct kvm_vm_stat, generic.stat) \ + }, \ + .name = #stat, \ + } +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ + { \ + { \ + STATS_DESC_COMMON(type, unit, base, exp), \ + .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ + }, \ + .name = #stat, \ + } +#define VM_STATS_DESC(stat, type, unit, base, exp) \ + { \ + { \ + STATS_DESC_COMMON(type, unit, base, exp), \ + .offset = offsetof(struct kvm_vm_stat, stat) \ + }, \ + .name = #stat, \ + } +#define VCPU_STATS_DESC(stat, type, unit, base, exp) \ + { \ + { \ + STATS_DESC_COMMON(type, unit, base, exp), \ + .offset = offsetof(struct kvm_vcpu_stat, stat) \ + }, \ + .name = #stat, \ + } +/* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ +#define STATS_DESC(SCOPE, stat, type, unit, base, exp) \ + SCOPE##_STATS_DESC(stat, type, unit, base, exp) + +#define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent) +#define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent) +#define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent) + +/* Cumulative counter, read/write */ +#define STATS_DESC_COUNTER(SCOPE, name) \ + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE, \ + KVM_STATS_BASE_POW10, 0) +/* Instantaneous counter, read only */ +#define STATS_DESC_ICOUNTER(SCOPE, name) \ + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE, \ + KVM_STATS_BASE_POW10, 0) +/* Peak counter, read/write */ +#define STATS_DESC_PCOUNTER(SCOPE, name) \ + STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE, \ + KVM_STATS_BASE_POW10, 0) + +/* Cumulative time in nanosecond */ +#define STATS_DESC_TIME_NSEC(SCOPE, name) \ + STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9) + extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; +ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, + const struct _kvm_stats_desc *desc, + void *stats, size_t size_stats, + char __user *user_buffer, size_t size, loff_t *offset); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 48db778291b7..ed6a985c5680 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -89,4 +89,6 @@ struct kvm_vcpu_stat_generic { u64 halt_poll_fail_ns; }; +#define KVM_STATS_NAME_SIZE 48 + #endif /* __KVM_TYPES_H__ */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 330835f1005b..f1ba602260f6 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1087,6 +1087,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SREGS2 200 #define KVM_CAP_EXIT_HYPERCALL 201 #define KVM_CAP_PPC_RPT_INVALIDATE 202 +#define KVM_CAP_BINARY_STATS_FD 203 #ifdef KVM_CAP_IRQ_ROUTING @@ -1906,4 +1907,76 @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +/** + * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. + * @flags: Some extra information for header, always 0 for now. + * @name_size: The size in bytes of the memory which contains statistics + * name string including trailing '\0'. The memory is allocated + * at the send of statistics descriptor. + * @num_desc: The number of statistics the vm or vcpu has. + * @id_offset: The offset of the vm/vcpu stats' id string in the file pointed + * by vm/vcpu stats fd. + * @desc_offset: The offset of the vm/vcpu stats' descriptor block in the file + * pointd by vm/vcpu stats fd. + * @data_offset: The offset of the vm/vcpu stats' data block in the file + * pointed by vm/vcpu stats fd. + * + * This is the header userspace needs to read from stats fd before any other + * readings. It is used by userspace to discover all the information about the + * vm/vcpu's binary statistics. + * Userspace reads this header from the start of the vm/vcpu's stats fd. + */ +struct kvm_stats_header { + __u32 flags; + __u32 name_size; + __u32 num_desc; + __u32 id_offset; + __u32 desc_offset; + __u32 data_offset; +}; + +#define KVM_STATS_TYPE_SHIFT 0 +#define KVM_STATS_TYPE_MASK (0xF << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_PEAK + +#define KVM_STATS_UNIT_SHIFT 4 +#define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_NONE (0x0 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES + +#define KVM_STATS_BASE_SHIFT 8 +#define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) +#define KVM_STATS_BASE_POW10 (0x0 << KVM_STATS_BASE_SHIFT) +#define KVM_STATS_BASE_POW2 (0x1 << KVM_STATS_BASE_SHIFT) +#define KVM_STATS_BASE_MAX KVM_STATS_BASE_POW2 + +/** + * struct kvm_stats_desc - Descriptor of a KVM statistics. + * @flags: Annotations of the stats, like type, unit, etc. + * @exponent: Used together with @flags to determine the unit. + * @size: The number of data items for this stats. + * Every data item is of type __u64. + * @offset: The offset of the stats to the start of stat structure in + * struture kvm or kvm_vcpu. + * @unused: Unused field for future usage. Always 0 for now. + * @name: The name string for the stats. Its size is indicated by the + * &kvm_stats_header->name_size. + */ +struct kvm_stats_desc { + __u32 flags; + __s16 exponent; + __u16 size; + __u32 offset; + __u32 unused; + char name[]; +}; + +#define KVM_GET_STATS_FD _IO(KVMIO, 0xce) + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/binary_stats.c b/virt/kvm/binary_stats.c new file mode 100644 index 000000000000..e609d428811a --- /dev/null +++ b/virt/kvm/binary_stats.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM binary statistics interface implementation + * + * Copyright 2021 Google LLC + */ + +#include +#include +#include +#include + +/** + * kvm_stats_read() - Common function to read from the binary statistics + * file descriptor. + * + * @id: identification string of the stats + * @header: stats header for a vm or a vcpu + * @desc: start address of an array of stats descriptors for a vm or a vcpu + * @stats: start address of stats data block for a vm or a vcpu + * @size_stats: the size of stats data block pointed by @stats + * @user_buffer: start address of userspace buffer + * @size: requested read size from userspace + * @offset: the start position from which the content will be read for the + * corresponding vm or vcp file descriptor + * + * The file content of a vm/vcpu file descriptor is now defined as below: + * +-------------+ + * | Header | + * +-------------+ + * | id string | + * +-------------+ + * | Descriptors | + * +-------------+ + * | Stats Data | + * +-------------+ + * Although this function allows userspace to read any amount of data (as long + * as in the limit) from any position, the typical usage would follow below + * steps: + * 1. Read header from offset 0. Get the offset of descriptors and stats data + * and some other necessary information. This is a one-time work for the + * lifecycle of the corresponding vm/vcpu stats fd. + * 2. Read id string from its offset. This is a one-time work for the lifecycle + * of the corresponding vm/vcpu stats fd. + * 3. Read descriptors from its offset and discover all the stats by parsing + * descriptors. This is a one-time work for the lifecycle of the + * corresponding vm/vcpu stats fd. + * 4. Periodically read stats data from its offset using pread. + * + * Return: the number of bytes that has been successfully read + */ +ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, + const struct _kvm_stats_desc *desc, + void *stats, size_t size_stats, + char __user *user_buffer, size_t size, loff_t *offset) +{ + ssize_t len; + ssize_t copylen; + ssize_t remain = size; + size_t size_desc; + size_t size_header; + void *src; + loff_t pos = *offset; + char __user *dest = user_buffer; + + size_header = sizeof(*header); + size_desc = header->num_desc * sizeof(*desc); + + len = KVM_STATS_NAME_SIZE + size_header + size_desc + size_stats - pos; + len = min(len, remain); + if (len <= 0) + return 0; + remain = len; + + /* + * Copy kvm stats header. + * The header is the first block of content userspace usually read out. + * The pos is 0 and the copylen and remain would be the size of header. + * The copy of the header would be skipped if offset is larger than the + * size of header. That usually happens when userspace reads stats + * descriptors and stats data. + */ + copylen = size_header - pos; + copylen = min(copylen, remain); + if (copylen > 0) { + src = (void *)header + pos; + if (copy_to_user(dest, src, copylen)) + return -EFAULT; + remain -= copylen; + pos += copylen; + dest += copylen; + } + + /* + * Copy kvm stats header id string. + * The id string is unique for every vm/vcpu, which is stored in kvm + * and kvm_vcpu structure. + * The id string is part of the stat header from the perspective of + * userspace, it is usually read out together with previous constant + * header part and could be skipped for later descriptors and stats + * data readings. + */ + copylen = header->id_offset + KVM_STATS_NAME_SIZE - pos; + copylen = min(copylen, remain); + if (copylen > 0) { + src = id + pos - header->id_offset; + if (copy_to_user(dest, src, copylen)) + return -EFAULT; + remain -= copylen; + pos += copylen; + dest += copylen; + } + + /* + * Copy kvm stats descriptors. + * The descriptors copy would be skipped in the typical case that + * userspace periodically read stats data, since the pos would be + * greater than the end address of descriptors + * (header->header.desc_offset + size_desc) causing copylen <= 0. + */ + copylen = header->desc_offset + size_desc - pos; + copylen = min(copylen, remain); + if (copylen > 0) { + src = (void *)desc + pos - header->desc_offset; + if (copy_to_user(dest, src, copylen)) + return -EFAULT; + remain -= copylen; + pos += copylen; + dest += copylen; + } + + /* Copy kvm stats values */ + copylen = header->data_offset + size_stats - pos; + copylen = min(copylen, remain); + if (copylen > 0) { + src = stats + pos - header->data_offset; + if (copy_to_user(dest, src, copylen)) + return -EFAULT; + remain -= copylen; + pos += copylen; + dest += copylen; + } + + *offset = pos; + return len; +} -- cgit v1.2.3 From fcfe1baeddbf1c7c448b44c82586d0cbc8abc9f5 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 18 Jun 2021 22:27:05 +0000 Subject: KVM: stats: Support binary stats retrieval for a VM Add a VM ioctl to get a statistics file descriptor by which a read functionality is provided for userspace to read out VM stats header, descriptors and data. Define VM statistics descriptors and header for all architectures. Reviewed-by: David Matlack Reviewed-by: Ricardo Koller Reviewed-by: Krish Sadhukhan Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba #arm64 Signed-off-by: Jing Zhang Message-Id: <20210618222709.1858088-4-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 15 +++++++++++++++ arch/mips/kvm/mips.c | 15 +++++++++++++++ arch/powerpc/kvm/book3s.c | 17 +++++++++++++++++ arch/powerpc/kvm/booke.c | 17 +++++++++++++++++ arch/s390/kvm/kvm-s390.c | 20 ++++++++++++++++++++ arch/x86/kvm/x86.c | 25 +++++++++++++++++++++++++ include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 157 insertions(+) (limited to 'arch/x86') diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 988ead309cbe..d7606a3c449b 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -28,6 +28,21 @@ #include "trace.h" +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS() +}; +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == + sizeof(struct kvm_vm_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2f2969aef60c..9f8b203737df 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -38,6 +38,21 @@ #define VECTORSPACING 0x100 /* for EI/VI mode */ #endif +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS() +}; +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == + sizeof(struct kvm_vm_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("wait", wait_exits), VCPU_STAT("cache", cache_exits), diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index ae9f1b855ff9..1f004837f9c5 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -38,6 +38,23 @@ /* #define EXIT_DEBUG */ +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_ICOUNTER(VM, num_2M_pages), + STATS_DESC_ICOUNTER(VM, num_1G_pages) +}; +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == + sizeof(struct kvm_vm_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("exits", sum_exits), VCPU_STAT("mmio", mmio_exits), diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 7a75559ab51d..a49ea4dcf963 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -36,6 +36,23 @@ unsigned long kvmppc_booke_handlers; +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_ICOUNTER(VM, num_2M_pages), + STATS_DESC_ICOUNTER(VM, num_1G_pages) +}; +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == + sizeof(struct kvm_vm_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("mmio", mmio_exits), VCPU_STAT("sig", signal_exits), diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 75ad44c44717..c7c7a28af41c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -58,6 +58,26 @@ #define VCPU_IRQS_MAX_BUF (sizeof(struct kvm_s390_irq) * \ (KVM_MAX_VCPUS + LOCAL_IRQS)) +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_COUNTER(VM, inject_io), + STATS_DESC_COUNTER(VM, inject_float_mchk), + STATS_DESC_COUNTER(VM, inject_pfault_done), + STATS_DESC_COUNTER(VM, inject_service_signal), + STATS_DESC_COUNTER(VM, inject_virtio) +}; +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == + sizeof(struct kvm_vm_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("userspace_handled", exit_userspace), VCPU_STAT("exit_null", exit_null), diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 71202330848a..570fd0704847 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -223,6 +223,31 @@ EXPORT_SYMBOL_GPL(host_xss); u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_COUNTER(VM, mmu_shadow_zapped), + STATS_DESC_COUNTER(VM, mmu_pte_write), + STATS_DESC_COUNTER(VM, mmu_pde_zapped), + STATS_DESC_COUNTER(VM, mmu_flooded), + STATS_DESC_COUNTER(VM, mmu_recycled), + STATS_DESC_COUNTER(VM, mmu_cache_miss), + STATS_DESC_ICOUNTER(VM, mmu_unsync), + STATS_DESC_ICOUNTER(VM, lpages), + STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_ICOUNTER(VM, max_mmu_page_hash_collisions) +}; +static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == + sizeof(struct kvm_vm_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("pf_fixed", pf_fixed), VCPU_STAT("pf_guest", pf_guest), diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9ee7f350473b..e79ce64b9f6f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -599,6 +599,7 @@ struct kvm { #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; #endif + char stats_id[KVM_STATS_NAME_SIZE]; }; #define kvm_err(fmt, ...) \ @@ -1354,12 +1355,17 @@ struct _kvm_stats_desc { STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) +#define KVM_GENERIC_VM_STATS() \ + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) + extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); +extern const struct kvm_stats_header kvm_vm_stats_header; +extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cec986487b30..33ec43a87d0f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4055,6 +4055,42 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, } } +static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer, + size_t size, loff_t *offset) +{ + struct kvm *kvm = file->private_data; + + return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header, + &kvm_vm_stats_desc[0], &kvm->stat, + sizeof(kvm->stat), user_buffer, size, offset); +} + +static const struct file_operations kvm_vm_stats_fops = { + .read = kvm_vm_stats_read, + .llseek = noop_llseek, +}; + +static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) +{ + int fd; + struct file *file; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + file = anon_inode_getfile("kvm-vm-stats", + &kvm_vm_stats_fops, kvm, O_RDONLY); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + file->f_mode |= FMODE_PREAD; + fd_install(fd, file); + + return fd; +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4237,6 +4273,9 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_RESET_DIRTY_RINGS: r = kvm_vm_ioctl_reset_dirty_pages(kvm); break; + case KVM_GET_STATS_FD: + r = kvm_vm_ioctl_get_stats_fd(kvm); + break; default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } @@ -4316,6 +4355,9 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) if (r < 0) goto put_kvm; + snprintf(kvm->stats_id, sizeof(kvm->stats_id), + "kvm-%d", task_pid_nr(current)); + file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (IS_ERR(file)) { put_unused_fd(r); -- cgit v1.2.3 From ce55c049459cff0034cc1bcfdce3bf343a2d6317 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 18 Jun 2021 22:27:06 +0000 Subject: KVM: stats: Support binary stats retrieval for a VCPU Add a VCPU ioctl to get a statistics file descriptor by which a read functionality is provided for userspace to read out VCPU stats header, descriptors and data. Define VCPU statistics descriptors and header for all architectures. Reviewed-by: David Matlack Reviewed-by: Ricardo Koller Reviewed-by: Krish Sadhukhan Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba #arm64 Signed-off-by: Jing Zhang Message-Id: <20210618222709.1858088-5-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 21 +++++++++ arch/mips/kvm/mips.c | 44 +++++++++++++++++++ arch/powerpc/kvm/book3s.c | 45 +++++++++++++++++++ arch/powerpc/kvm/booke.c | 38 ++++++++++++++++ arch/s390/kvm/kvm-s390.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 41 ++++++++++++++++++ include/linux/kvm_host.h | 13 +++++- virt/kvm/kvm_main.c | 51 +++++++++++++++++++++- 8 files changed, 359 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index d7606a3c449b..f1dc2092d3a0 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -43,6 +43,27 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, hvc_exit_stat), + STATS_DESC_COUNTER(VCPU, wfe_exit_stat), + STATS_DESC_COUNTER(VCPU, wfi_exit_stat), + STATS_DESC_COUNTER(VCPU, mmio_exit_user), + STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, exits) +}; +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == + sizeof(struct kvm_vcpu_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9f8b203737df..2aba78c2266d 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -53,6 +53,50 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, wait_exits), + STATS_DESC_COUNTER(VCPU, cache_exits), + STATS_DESC_COUNTER(VCPU, signal_exits), + STATS_DESC_COUNTER(VCPU, int_exits), + STATS_DESC_COUNTER(VCPU, cop_unusable_exits), + STATS_DESC_COUNTER(VCPU, tlbmod_exits), + STATS_DESC_COUNTER(VCPU, tlbmiss_ld_exits), + STATS_DESC_COUNTER(VCPU, tlbmiss_st_exits), + STATS_DESC_COUNTER(VCPU, addrerr_st_exits), + STATS_DESC_COUNTER(VCPU, addrerr_ld_exits), + STATS_DESC_COUNTER(VCPU, syscall_exits), + STATS_DESC_COUNTER(VCPU, resvd_inst_exits), + STATS_DESC_COUNTER(VCPU, break_inst_exits), + STATS_DESC_COUNTER(VCPU, trap_inst_exits), + STATS_DESC_COUNTER(VCPU, msa_fpe_exits), + STATS_DESC_COUNTER(VCPU, fpe_exits), + STATS_DESC_COUNTER(VCPU, msa_disabled_exits), + STATS_DESC_COUNTER(VCPU, flush_dcache_exits), + STATS_DESC_COUNTER(VCPU, vz_gpsi_exits), + STATS_DESC_COUNTER(VCPU, vz_gsfc_exits), + STATS_DESC_COUNTER(VCPU, vz_hc_exits), + STATS_DESC_COUNTER(VCPU, vz_grr_exits), + STATS_DESC_COUNTER(VCPU, vz_gva_exits), + STATS_DESC_COUNTER(VCPU, vz_ghfc_exits), + STATS_DESC_COUNTER(VCPU, vz_gpa_exits), + STATS_DESC_COUNTER(VCPU, vz_resvd_exits), +#ifdef CONFIG_CPU_LOONGSON64 + STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits), +#endif +}; +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == + sizeof(struct kvm_vcpu_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("wait", wait_exits), VCPU_STAT("cache", cache_exits), diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 1f004837f9c5..61229302bce2 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -55,6 +55,51 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, sum_exits), + STATS_DESC_COUNTER(VCPU, mmio_exits), + STATS_DESC_COUNTER(VCPU, signal_exits), + STATS_DESC_COUNTER(VCPU, light_exits), + STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits), + STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits), + STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits), + STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits), + STATS_DESC_COUNTER(VCPU, syscall_exits), + STATS_DESC_COUNTER(VCPU, isi_exits), + STATS_DESC_COUNTER(VCPU, dsi_exits), + STATS_DESC_COUNTER(VCPU, emulated_inst_exits), + STATS_DESC_COUNTER(VCPU, dec_exits), + STATS_DESC_COUNTER(VCPU, ext_intr_exits), + STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), + STATS_DESC_COUNTER(VCPU, halt_successful_wait), + STATS_DESC_COUNTER(VCPU, dbell_exits), + STATS_DESC_COUNTER(VCPU, gdbell_exits), + STATS_DESC_COUNTER(VCPU, ld), + STATS_DESC_COUNTER(VCPU, st), + STATS_DESC_COUNTER(VCPU, pf_storage), + STATS_DESC_COUNTER(VCPU, pf_instruc), + STATS_DESC_COUNTER(VCPU, sp_storage), + STATS_DESC_COUNTER(VCPU, sp_instruc), + STATS_DESC_COUNTER(VCPU, queue_intr), + STATS_DESC_COUNTER(VCPU, ld_slow), + STATS_DESC_COUNTER(VCPU, st_slow), + STATS_DESC_COUNTER(VCPU, pthru_all), + STATS_DESC_COUNTER(VCPU, pthru_host), + STATS_DESC_COUNTER(VCPU, pthru_bad_aff) +}; +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == + sizeof(struct kvm_vcpu_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("exits", sum_exits), VCPU_STAT("mmio", mmio_exits), diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index a49ea4dcf963..6e8de33bc138 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -53,6 +53,44 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, sum_exits), + STATS_DESC_COUNTER(VCPU, mmio_exits), + STATS_DESC_COUNTER(VCPU, signal_exits), + STATS_DESC_COUNTER(VCPU, light_exits), + STATS_DESC_COUNTER(VCPU, itlb_real_miss_exits), + STATS_DESC_COUNTER(VCPU, itlb_virt_miss_exits), + STATS_DESC_COUNTER(VCPU, dtlb_real_miss_exits), + STATS_DESC_COUNTER(VCPU, dtlb_virt_miss_exits), + STATS_DESC_COUNTER(VCPU, syscall_exits), + STATS_DESC_COUNTER(VCPU, isi_exits), + STATS_DESC_COUNTER(VCPU, dsi_exits), + STATS_DESC_COUNTER(VCPU, emulated_inst_exits), + STATS_DESC_COUNTER(VCPU, dec_exits), + STATS_DESC_COUNTER(VCPU, ext_intr_exits), + STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), + STATS_DESC_COUNTER(VCPU, halt_successful_wait), + STATS_DESC_COUNTER(VCPU, dbell_exits), + STATS_DESC_COUNTER(VCPU, gdbell_exits), + STATS_DESC_COUNTER(VCPU, ld), + STATS_DESC_COUNTER(VCPU, st), + STATS_DESC_COUNTER(VCPU, pthru_all), + STATS_DESC_COUNTER(VCPU, pthru_host), + STATS_DESC_COUNTER(VCPU, pthru_bad_aff) +}; +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == + sizeof(struct kvm_vcpu_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("mmio", mmio_exits), VCPU_STAT("sig", signal_exits), diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c7c7a28af41c..8ac10bcaf8ba 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -78,6 +78,114 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, exit_userspace), + STATS_DESC_COUNTER(VCPU, exit_null), + STATS_DESC_COUNTER(VCPU, exit_external_request), + STATS_DESC_COUNTER(VCPU, exit_io_request), + STATS_DESC_COUNTER(VCPU, exit_external_interrupt), + STATS_DESC_COUNTER(VCPU, exit_stop_request), + STATS_DESC_COUNTER(VCPU, exit_validity), + STATS_DESC_COUNTER(VCPU, exit_instruction), + STATS_DESC_COUNTER(VCPU, exit_pei), + STATS_DESC_COUNTER(VCPU, halt_no_poll_steal), + STATS_DESC_COUNTER(VCPU, instruction_lctl), + STATS_DESC_COUNTER(VCPU, instruction_lctlg), + STATS_DESC_COUNTER(VCPU, instruction_stctl), + STATS_DESC_COUNTER(VCPU, instruction_stctg), + STATS_DESC_COUNTER(VCPU, exit_program_interruption), + STATS_DESC_COUNTER(VCPU, exit_instr_and_program), + STATS_DESC_COUNTER(VCPU, exit_operation_exception), + STATS_DESC_COUNTER(VCPU, deliver_ckc), + STATS_DESC_COUNTER(VCPU, deliver_cputm), + STATS_DESC_COUNTER(VCPU, deliver_external_call), + STATS_DESC_COUNTER(VCPU, deliver_emergency_signal), + STATS_DESC_COUNTER(VCPU, deliver_service_signal), + STATS_DESC_COUNTER(VCPU, deliver_virtio), + STATS_DESC_COUNTER(VCPU, deliver_stop_signal), + STATS_DESC_COUNTER(VCPU, deliver_prefix_signal), + STATS_DESC_COUNTER(VCPU, deliver_restart_signal), + STATS_DESC_COUNTER(VCPU, deliver_program), + STATS_DESC_COUNTER(VCPU, deliver_io), + STATS_DESC_COUNTER(VCPU, deliver_machine_check), + STATS_DESC_COUNTER(VCPU, exit_wait_state), + STATS_DESC_COUNTER(VCPU, inject_ckc), + STATS_DESC_COUNTER(VCPU, inject_cputm), + STATS_DESC_COUNTER(VCPU, inject_external_call), + STATS_DESC_COUNTER(VCPU, inject_emergency_signal), + STATS_DESC_COUNTER(VCPU, inject_mchk), + STATS_DESC_COUNTER(VCPU, inject_pfault_init), + STATS_DESC_COUNTER(VCPU, inject_program), + STATS_DESC_COUNTER(VCPU, inject_restart), + STATS_DESC_COUNTER(VCPU, inject_set_prefix), + STATS_DESC_COUNTER(VCPU, inject_stop_signal), + STATS_DESC_COUNTER(VCPU, instruction_epsw), + STATS_DESC_COUNTER(VCPU, instruction_gs), + STATS_DESC_COUNTER(VCPU, instruction_io_other), + STATS_DESC_COUNTER(VCPU, instruction_lpsw), + STATS_DESC_COUNTER(VCPU, instruction_lpswe), + STATS_DESC_COUNTER(VCPU, instruction_pfmf), + STATS_DESC_COUNTER(VCPU, instruction_ptff), + STATS_DESC_COUNTER(VCPU, instruction_sck), + STATS_DESC_COUNTER(VCPU, instruction_sckpf), + STATS_DESC_COUNTER(VCPU, instruction_stidp), + STATS_DESC_COUNTER(VCPU, instruction_spx), + STATS_DESC_COUNTER(VCPU, instruction_stpx), + STATS_DESC_COUNTER(VCPU, instruction_stap), + STATS_DESC_COUNTER(VCPU, instruction_iske), + STATS_DESC_COUNTER(VCPU, instruction_ri), + STATS_DESC_COUNTER(VCPU, instruction_rrbe), + STATS_DESC_COUNTER(VCPU, instruction_sske), + STATS_DESC_COUNTER(VCPU, instruction_ipte_interlock), + STATS_DESC_COUNTER(VCPU, instruction_stsi), + STATS_DESC_COUNTER(VCPU, instruction_stfl), + STATS_DESC_COUNTER(VCPU, instruction_tb), + STATS_DESC_COUNTER(VCPU, instruction_tpi), + STATS_DESC_COUNTER(VCPU, instruction_tprot), + STATS_DESC_COUNTER(VCPU, instruction_tsch), + STATS_DESC_COUNTER(VCPU, instruction_sie), + STATS_DESC_COUNTER(VCPU, instruction_essa), + STATS_DESC_COUNTER(VCPU, instruction_sthyi), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense), + STATS_DESC_COUNTER(VCPU, instruction_sigp_sense_running), + STATS_DESC_COUNTER(VCPU, instruction_sigp_external_call), + STATS_DESC_COUNTER(VCPU, instruction_sigp_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cond_emergency), + STATS_DESC_COUNTER(VCPU, instruction_sigp_start), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop), + STATS_DESC_COUNTER(VCPU, instruction_sigp_stop_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_store_adtl_status), + STATS_DESC_COUNTER(VCPU, instruction_sigp_arch), + STATS_DESC_COUNTER(VCPU, instruction_sigp_prefix), + STATS_DESC_COUNTER(VCPU, instruction_sigp_restart), + STATS_DESC_COUNTER(VCPU, instruction_sigp_init_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_cpu_reset), + STATS_DESC_COUNTER(VCPU, instruction_sigp_unknown), + STATS_DESC_COUNTER(VCPU, diagnose_10), + STATS_DESC_COUNTER(VCPU, diagnose_44), + STATS_DESC_COUNTER(VCPU, diagnose_9c), + STATS_DESC_COUNTER(VCPU, diagnose_9c_ignored), + STATS_DESC_COUNTER(VCPU, diagnose_9c_forward), + STATS_DESC_COUNTER(VCPU, diagnose_258), + STATS_DESC_COUNTER(VCPU, diagnose_308), + STATS_DESC_COUNTER(VCPU, diagnose_500), + STATS_DESC_COUNTER(VCPU, diagnose_other), + STATS_DESC_COUNTER(VCPU, pfault_sync) +}; +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == + sizeof(struct kvm_vcpu_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("userspace_handled", exit_userspace), VCPU_STAT("exit_null", exit_null), diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 570fd0704847..53b7c25d6ebc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -248,6 +248,47 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, pf_fixed), + STATS_DESC_COUNTER(VCPU, pf_guest), + STATS_DESC_COUNTER(VCPU, tlb_flush), + STATS_DESC_COUNTER(VCPU, invlpg), + STATS_DESC_COUNTER(VCPU, exits), + STATS_DESC_COUNTER(VCPU, io_exits), + STATS_DESC_COUNTER(VCPU, mmio_exits), + STATS_DESC_COUNTER(VCPU, signal_exits), + STATS_DESC_COUNTER(VCPU, irq_window_exits), + STATS_DESC_COUNTER(VCPU, nmi_window_exits), + STATS_DESC_COUNTER(VCPU, l1d_flush), + STATS_DESC_COUNTER(VCPU, halt_exits), + STATS_DESC_COUNTER(VCPU, request_irq_exits), + STATS_DESC_COUNTER(VCPU, irq_exits), + STATS_DESC_COUNTER(VCPU, host_state_reload), + STATS_DESC_COUNTER(VCPU, fpu_reload), + STATS_DESC_COUNTER(VCPU, insn_emulation), + STATS_DESC_COUNTER(VCPU, insn_emulation_fail), + STATS_DESC_COUNTER(VCPU, hypercalls), + STATS_DESC_COUNTER(VCPU, irq_injections), + STATS_DESC_COUNTER(VCPU, nmi_injections), + STATS_DESC_COUNTER(VCPU, req_event), + STATS_DESC_COUNTER(VCPU, nested_run), + STATS_DESC_COUNTER(VCPU, directed_yield_attempted), + STATS_DESC_COUNTER(VCPU, directed_yield_successful), + STATS_DESC_ICOUNTER(VCPU, guest_mode) +}; +static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == + sizeof(struct kvm_vcpu_stat) / sizeof(u64)); + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), +}; + struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("pf_fixed", pf_fixed), VCPU_STAT("pf_guest", pf_guest), diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e79ce64b9f6f..9e75afef16b0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -305,7 +305,6 @@ struct kvm_vcpu { struct pid __rcu *pid; int sigset_active; sigset_t sigset; - struct kvm_vcpu_stat stat; unsigned int halt_poll_ns; bool valid_wakeup; @@ -342,6 +341,8 @@ struct kvm_vcpu { bool preempted; bool ready; struct kvm_vcpu_arch arch; + struct kvm_vcpu_stat stat; + char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; }; @@ -1358,6 +1359,14 @@ struct _kvm_stats_desc { #define KVM_GENERIC_VM_STATS() \ STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) +#define KVM_GENERIC_VCPU_STATS() \ + STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ + STATS_DESC_COUNTER(VCPU_GENERIC, halt_attempted_poll), \ + STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ + STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) + extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, @@ -1366,6 +1375,8 @@ ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, char __user *user_buffer, size_t size, loff_t *offset); extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; +extern const struct kvm_stats_header kvm_vcpu_stats_header; +extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 33ec43a87d0f..c8d0028df4ac 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3448,6 +3448,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); + /* Fill the stats id string for the vcpu */ + snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", + task_pid_nr(current), id); + /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); @@ -3497,6 +3501,44 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) return 0; } +static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer, + size_t size, loff_t *offset) +{ + struct kvm_vcpu *vcpu = file->private_data; + + return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header, + &kvm_vcpu_stats_desc[0], &vcpu->stat, + sizeof(vcpu->stat), user_buffer, size, offset); +} + +static const struct file_operations kvm_vcpu_stats_fops = { + .read = kvm_vcpu_stats_read, + .llseek = noop_llseek, +}; + +static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) +{ + int fd; + struct file *file; + char name[15 + ITOA_MAX_LEN + 1]; + + snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id); + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) + return fd; + + file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY); + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + file->f_mode |= FMODE_PREAD; + fd_install(fd, file); + + return fd; +} + static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3694,6 +3736,10 @@ out_free1: r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); break; } + case KVM_GET_STATS_FD: { + r = kvm_vcpu_ioctl_get_stats_fd(vcpu); + break; + } default: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } @@ -3952,6 +3998,8 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #else return 0; #endif + case KVM_CAP_BINARY_STATS_FD: + return 1; default: break; } @@ -5254,7 +5302,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, SLAB_ACCOUNT, offsetof(struct kvm_vcpu, arch), - sizeof_field(struct kvm_vcpu, arch), + offsetofend(struct kvm_vcpu, stats_id) + - offsetof(struct kvm_vcpu, arch), NULL); if (!kvm_vcpu_cache) { r = -ENOMEM; -- cgit v1.2.3 From bc9e9e672df9f16f3825320c53ec01b3d44add28 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Wed, 23 Jun 2021 17:28:46 -0400 Subject: KVM: debugfs: Reuse binary stats descriptors To remove code duplication, use the binary stats descriptors in the implementation of the debugfs interface for statistics. This unifies the definition of statistics for the binary and debugfs interfaces. Signed-off-by: Jing Zhang Message-Id: <20210618222709.1858088-8-jingzhangos@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 16 ------- arch/mips/kvm/mips.c | 39 ----------------- arch/powerpc/kvm/book3s.c | 33 -------------- arch/powerpc/kvm/booke.c | 25 ----------- arch/s390/kvm/kvm-s390.c | 108 ---------------------------------------------- arch/x86/kvm/x86.c | 49 +-------------------- include/linux/kvm_host.h | 17 +------- virt/kvm/kvm_main.c | 104 ++++++++++++++++++++++++++++++++------------ 8 files changed, 78 insertions(+), 313 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index f1dc2092d3a0..1512a8007a78 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -64,22 +64,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), - VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), - VCPU_STAT("halt_wakeup", generic.halt_wakeup), - VCPU_STAT("hvc_exit_stat", hvc_exit_stat), - VCPU_STAT("wfe_exit_stat", wfe_exit_stat), - VCPU_STAT("wfi_exit_stat", wfi_exit_stat), - VCPU_STAT("mmio_exit_user", mmio_exit_user), - VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel), - VCPU_STAT("exits", exits), - VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), - { NULL } -}; - static bool core_reg_offset_is_vreg(u64 off) { return off >= KVM_REG_ARM_CORE_REG(fp_regs.vregs) && diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2aba78c2266d..af9dd029a4e1 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -97,45 +97,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("wait", wait_exits), - VCPU_STAT("cache", cache_exits), - VCPU_STAT("signal", signal_exits), - VCPU_STAT("interrupt", int_exits), - VCPU_STAT("cop_unusable", cop_unusable_exits), - VCPU_STAT("tlbmod", tlbmod_exits), - VCPU_STAT("tlbmiss_ld", tlbmiss_ld_exits), - VCPU_STAT("tlbmiss_st", tlbmiss_st_exits), - VCPU_STAT("addrerr_st", addrerr_st_exits), - VCPU_STAT("addrerr_ld", addrerr_ld_exits), - VCPU_STAT("syscall", syscall_exits), - VCPU_STAT("resvd_inst", resvd_inst_exits), - VCPU_STAT("break_inst", break_inst_exits), - VCPU_STAT("trap_inst", trap_inst_exits), - VCPU_STAT("msa_fpe", msa_fpe_exits), - VCPU_STAT("fpe", fpe_exits), - VCPU_STAT("msa_disabled", msa_disabled_exits), - VCPU_STAT("flush_dcache", flush_dcache_exits), - VCPU_STAT("vz_gpsi", vz_gpsi_exits), - VCPU_STAT("vz_gsfc", vz_gsfc_exits), - VCPU_STAT("vz_hc", vz_hc_exits), - VCPU_STAT("vz_grr", vz_grr_exits), - VCPU_STAT("vz_gva", vz_gva_exits), - VCPU_STAT("vz_ghfc", vz_ghfc_exits), - VCPU_STAT("vz_gpa", vz_gpa_exits), - VCPU_STAT("vz_resvd", vz_resvd_exits), -#ifdef CONFIG_CPU_LOONGSON64 - VCPU_STAT("vz_cpucfg", vz_cpucfg_exits), -#endif - VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), - VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), - VCPU_STAT("halt_wakeup", generic.halt_wakeup), - VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), - {NULL} -}; - bool kvm_trace_guest_mode_change; int kvm_guest_mode_change_trace_reg(void) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 61229302bce2..79833f78d1da 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -100,39 +100,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("exits", sum_exits), - VCPU_STAT("mmio", mmio_exits), - VCPU_STAT("sig", signal_exits), - VCPU_STAT("sysc", syscall_exits), - VCPU_STAT("inst_emu", emulated_inst_exits), - VCPU_STAT("dec", dec_exits), - VCPU_STAT("ext_intr", ext_intr_exits), - VCPU_STAT("queue_intr", queue_intr), - VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), - VCPU_STAT("halt_wait_ns", halt_wait_ns), - VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), - VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), - VCPU_STAT("halt_successful_wait", halt_successful_wait), - VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), - VCPU_STAT("halt_wakeup", generic.halt_wakeup), - VCPU_STAT("pf_storage", pf_storage), - VCPU_STAT("sp_storage", sp_storage), - VCPU_STAT("pf_instruc", pf_instruc), - VCPU_STAT("sp_instruc", sp_instruc), - VCPU_STAT("ld", ld), - VCPU_STAT("ld_slow", ld_slow), - VCPU_STAT("st", st), - VCPU_STAT("st_slow", st_slow), - VCPU_STAT("pthru_all", pthru_all), - VCPU_STAT("pthru_host", pthru_host), - VCPU_STAT("pthru_bad_aff", pthru_bad_aff), - VM_STAT("largepages_2M", num_2M_pages, .mode = 0444), - VM_STAT("largepages_1G", num_1G_pages, .mode = 0444), - { NULL } -}; - static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, unsigned long pending_now, unsigned long old_pending) { diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 6e8de33bc138..551b30d84aee 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -91,31 +91,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("mmio", mmio_exits), - VCPU_STAT("sig", signal_exits), - VCPU_STAT("itlb_r", itlb_real_miss_exits), - VCPU_STAT("itlb_v", itlb_virt_miss_exits), - VCPU_STAT("dtlb_r", dtlb_real_miss_exits), - VCPU_STAT("dtlb_v", dtlb_virt_miss_exits), - VCPU_STAT("sysc", syscall_exits), - VCPU_STAT("isi", isi_exits), - VCPU_STAT("dsi", dsi_exits), - VCPU_STAT("inst_emu", emulated_inst_exits), - VCPU_STAT("dec", dec_exits), - VCPU_STAT("ext_intr", ext_intr_exits), - VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), - VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), - VCPU_STAT("halt_wakeup", generic.halt_wakeup), - VCPU_STAT("doorbell", dbell_exits), - VCPU_STAT("guest doorbell", gdbell_exits), - VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), - VM_STAT("remote_tlb_flush", generic.remote_tlb_flush), - { NULL } -}; - /* TODO: use vcpu_printf() */ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8ac10bcaf8ba..1695f0ced5ba 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -186,114 +186,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("userspace_handled", exit_userspace), - VCPU_STAT("exit_null", exit_null), - VCPU_STAT("pfault_sync", pfault_sync), - VCPU_STAT("exit_validity", exit_validity), - VCPU_STAT("exit_stop_request", exit_stop_request), - VCPU_STAT("exit_external_request", exit_external_request), - VCPU_STAT("exit_io_request", exit_io_request), - VCPU_STAT("exit_external_interrupt", exit_external_interrupt), - VCPU_STAT("exit_instruction", exit_instruction), - VCPU_STAT("exit_pei", exit_pei), - VCPU_STAT("exit_program_interruption", exit_program_interruption), - VCPU_STAT("exit_instr_and_program_int", exit_instr_and_program), - VCPU_STAT("exit_operation_exception", exit_operation_exception), - VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), - VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), - VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal), - VCPU_STAT("halt_wakeup", generic.halt_wakeup), - VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), - VCPU_STAT("instruction_lctlg", instruction_lctlg), - VCPU_STAT("instruction_lctl", instruction_lctl), - VCPU_STAT("instruction_stctl", instruction_stctl), - VCPU_STAT("instruction_stctg", instruction_stctg), - VCPU_STAT("deliver_ckc", deliver_ckc), - VCPU_STAT("deliver_cputm", deliver_cputm), - VCPU_STAT("deliver_emergency_signal", deliver_emergency_signal), - VCPU_STAT("deliver_external_call", deliver_external_call), - VCPU_STAT("deliver_service_signal", deliver_service_signal), - VCPU_STAT("deliver_virtio", deliver_virtio), - VCPU_STAT("deliver_stop_signal", deliver_stop_signal), - VCPU_STAT("deliver_prefix_signal", deliver_prefix_signal), - VCPU_STAT("deliver_restart_signal", deliver_restart_signal), - VCPU_STAT("deliver_program", deliver_program), - VCPU_STAT("deliver_io", deliver_io), - VCPU_STAT("deliver_machine_check", deliver_machine_check), - VCPU_STAT("exit_wait_state", exit_wait_state), - VCPU_STAT("inject_ckc", inject_ckc), - VCPU_STAT("inject_cputm", inject_cputm), - VCPU_STAT("inject_external_call", inject_external_call), - VM_STAT("inject_float_mchk", inject_float_mchk), - VCPU_STAT("inject_emergency_signal", inject_emergency_signal), - VM_STAT("inject_io", inject_io), - VCPU_STAT("inject_mchk", inject_mchk), - VM_STAT("inject_pfault_done", inject_pfault_done), - VCPU_STAT("inject_program", inject_program), - VCPU_STAT("inject_restart", inject_restart), - VM_STAT("inject_service_signal", inject_service_signal), - VCPU_STAT("inject_set_prefix", inject_set_prefix), - VCPU_STAT("inject_stop_signal", inject_stop_signal), - VCPU_STAT("inject_pfault_init", inject_pfault_init), - VM_STAT("inject_virtio", inject_virtio), - VCPU_STAT("instruction_epsw", instruction_epsw), - VCPU_STAT("instruction_gs", instruction_gs), - VCPU_STAT("instruction_io_other", instruction_io_other), - VCPU_STAT("instruction_lpsw", instruction_lpsw), - VCPU_STAT("instruction_lpswe", instruction_lpswe), - VCPU_STAT("instruction_pfmf", instruction_pfmf), - VCPU_STAT("instruction_ptff", instruction_ptff), - VCPU_STAT("instruction_stidp", instruction_stidp), - VCPU_STAT("instruction_sck", instruction_sck), - VCPU_STAT("instruction_sckpf", instruction_sckpf), - VCPU_STAT("instruction_spx", instruction_spx), - VCPU_STAT("instruction_stpx", instruction_stpx), - VCPU_STAT("instruction_stap", instruction_stap), - VCPU_STAT("instruction_iske", instruction_iske), - VCPU_STAT("instruction_ri", instruction_ri), - VCPU_STAT("instruction_rrbe", instruction_rrbe), - VCPU_STAT("instruction_sske", instruction_sske), - VCPU_STAT("instruction_ipte_interlock", instruction_ipte_interlock), - VCPU_STAT("instruction_essa", instruction_essa), - VCPU_STAT("instruction_stsi", instruction_stsi), - VCPU_STAT("instruction_stfl", instruction_stfl), - VCPU_STAT("instruction_tb", instruction_tb), - VCPU_STAT("instruction_tpi", instruction_tpi), - VCPU_STAT("instruction_tprot", instruction_tprot), - VCPU_STAT("instruction_tsch", instruction_tsch), - VCPU_STAT("instruction_sthyi", instruction_sthyi), - VCPU_STAT("instruction_sie", instruction_sie), - VCPU_STAT("instruction_sigp_sense", instruction_sigp_sense), - VCPU_STAT("instruction_sigp_sense_running", instruction_sigp_sense_running), - VCPU_STAT("instruction_sigp_external_call", instruction_sigp_external_call), - VCPU_STAT("instruction_sigp_emergency", instruction_sigp_emergency), - VCPU_STAT("instruction_sigp_cond_emergency", instruction_sigp_cond_emergency), - VCPU_STAT("instruction_sigp_start", instruction_sigp_start), - VCPU_STAT("instruction_sigp_stop", instruction_sigp_stop), - VCPU_STAT("instruction_sigp_stop_store_status", instruction_sigp_stop_store_status), - VCPU_STAT("instruction_sigp_store_status", instruction_sigp_store_status), - VCPU_STAT("instruction_sigp_store_adtl_status", instruction_sigp_store_adtl_status), - VCPU_STAT("instruction_sigp_set_arch", instruction_sigp_arch), - VCPU_STAT("instruction_sigp_set_prefix", instruction_sigp_prefix), - VCPU_STAT("instruction_sigp_restart", instruction_sigp_restart), - VCPU_STAT("instruction_sigp_cpu_reset", instruction_sigp_cpu_reset), - VCPU_STAT("instruction_sigp_init_cpu_reset", instruction_sigp_init_cpu_reset), - VCPU_STAT("instruction_sigp_unknown", instruction_sigp_unknown), - VCPU_STAT("instruction_diag_10", diagnose_10), - VCPU_STAT("instruction_diag_44", diagnose_44), - VCPU_STAT("instruction_diag_9c", diagnose_9c), - VCPU_STAT("diag_9c_ignored", diagnose_9c_ignored), - VCPU_STAT("diag_9c_forward", diagnose_9c_forward), - VCPU_STAT("instruction_diag_258", diagnose_258), - VCPU_STAT("instruction_diag_308", diagnose_308), - VCPU_STAT("instruction_diag_500", diagnose_500), - VCPU_STAT("instruction_diag_other", diagnose_other), - { NULL } -}; - /* allow nested virtualization in KVM (if enabled by user space) */ static int nested; module_param(nested, int, S_IRUGO); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 53b7c25d6ebc..5833b8780808 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -234,7 +234,7 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, mmu_unsync), STATS_DESC_ICOUNTER(VM, lpages), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), - STATS_DESC_ICOUNTER(VM, max_mmu_page_hash_collisions) + STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == sizeof(struct kvm_vm_stat) / sizeof(u64)); @@ -289,53 +289,6 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT("pf_fixed", pf_fixed), - VCPU_STAT("pf_guest", pf_guest), - VCPU_STAT("tlb_flush", tlb_flush), - VCPU_STAT("invlpg", invlpg), - VCPU_STAT("exits", exits), - VCPU_STAT("io_exits", io_exits), - VCPU_STAT("mmio_exits", mmio_exits), - VCPU_STAT("signal_exits", signal_exits), - VCPU_STAT("irq_window", irq_window_exits), - VCPU_STAT("nmi_window", nmi_window_exits), - VCPU_STAT("halt_exits", halt_exits), - VCPU_STAT("halt_successful_poll", generic.halt_successful_poll), - VCPU_STAT("halt_attempted_poll", generic.halt_attempted_poll), - VCPU_STAT("halt_poll_invalid", generic.halt_poll_invalid), - VCPU_STAT("halt_wakeup", generic.halt_wakeup), - VCPU_STAT("hypercalls", hypercalls), - VCPU_STAT("request_irq", request_irq_exits), - VCPU_STAT("irq_exits", irq_exits), - VCPU_STAT("host_state_reload", host_state_reload), - VCPU_STAT("fpu_reload", fpu_reload), - VCPU_STAT("insn_emulation", insn_emulation), - VCPU_STAT("insn_emulation_fail", insn_emulation_fail), - VCPU_STAT("irq_injections", irq_injections), - VCPU_STAT("nmi_injections", nmi_injections), - VCPU_STAT("req_event", req_event), - VCPU_STAT("l1d_flush", l1d_flush), - VCPU_STAT("halt_poll_success_ns", generic.halt_poll_success_ns), - VCPU_STAT("halt_poll_fail_ns", generic.halt_poll_fail_ns), - VCPU_STAT("nested_run", nested_run), - VCPU_STAT("directed_yield_attempted", directed_yield_attempted), - VCPU_STAT("directed_yield_successful", directed_yield_successful), - VCPU_STAT("guest_mode", guest_mode), - VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), - VM_STAT("mmu_pte_write", mmu_pte_write), - VM_STAT("mmu_pde_zapped", mmu_pde_zapped), - VM_STAT("mmu_flooded", mmu_flooded), - VM_STAT("mmu_recycled", mmu_recycled), - VM_STAT("mmu_cache_miss", mmu_cache_miss), - VM_STAT("mmu_unsync", mmu_unsync), - VM_STAT("remote_tlb_flush", generic.remote_tlb_flush), - VM_STAT("largepages", lpages, .mode = 0444), - VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), - VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), - { NULL } -}; - u64 __read_mostly host_xcr0; u64 __read_mostly supported_xcr0; EXPORT_SYMBOL_GPL(supported_xcr0); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9e75afef16b0..ae7735b490b4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1264,14 +1264,8 @@ enum kvm_stat_kind { struct kvm_stat_data { struct kvm *kvm; - struct kvm_stats_debugfs_item *dbgfs_item; -}; - -struct kvm_stats_debugfs_item { - const char *name; - int offset; + const struct _kvm_stats_desc *desc; enum kvm_stat_kind kind; - int mode; }; struct _kvm_stats_desc { @@ -1279,14 +1273,6 @@ struct _kvm_stats_desc { char name[KVM_STATS_NAME_SIZE]; }; -#define KVM_DBGFS_GET_MODE(dbgfs_item) \ - ((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644) - -#define VM_STAT(n, x, ...) \ - { n, offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ } -#define VCPU_STAT(n, x, ...) \ - { n, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ } - #define STATS_DESC_COMMON(type, unit, base, exp) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ @@ -1367,7 +1353,6 @@ struct _kvm_stats_desc { STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) -extern struct kvm_stats_debugfs_item debugfs_entries[]; extern struct dentry *kvm_debugfs_dir; ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c8d0028df4ac..3dcc2abbfc60 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -115,7 +115,6 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); struct dentry *kvm_debugfs_dir; EXPORT_SYMBOL_GPL(kvm_debugfs_dir); -static int kvm_debugfs_num_entries; static const struct file_operations stat_fops_per_vm; static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, @@ -860,9 +859,24 @@ static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) kvfree(slots); } +static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc) +{ + switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) { + case KVM_STATS_TYPE_INSTANT: + return 0444; + case KVM_STATS_TYPE_CUMULATIVE: + case KVM_STATS_TYPE_PEAK: + default: + return 0644; + } +} + + static void kvm_destroy_vm_debugfs(struct kvm *kvm) { int i; + int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + + kvm_vcpu_stats_header.num_desc; if (!kvm->debugfs_dentry) return; @@ -880,7 +894,10 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) { char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; - struct kvm_stats_debugfs_item *p; + const struct _kvm_stats_desc *pdesc; + int i; + int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + + kvm_vcpu_stats_header.num_desc; if (!debugfs_initialized()) return 0; @@ -894,15 +911,32 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) if (!kvm->debugfs_stat_data) return -ENOMEM; - for (p = debugfs_entries; p->name; p++) { + for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { + pdesc = &kvm_vm_stats_desc[i]; stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) return -ENOMEM; stat_data->kvm = kvm; - stat_data->dbgfs_item = p; - kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; - debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), + stat_data->desc = pdesc; + stat_data->kind = KVM_STAT_VM; + kvm->debugfs_stat_data[i] = stat_data; + debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), + kvm->debugfs_dentry, stat_data, + &stat_fops_per_vm); + } + + for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { + pdesc = &kvm_vcpu_stats_desc[i]; + stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); + if (!stat_data) + return -ENOMEM; + + stat_data->kvm = kvm; + stat_data->desc = pdesc; + stat_data->kind = KVM_STAT_VCPU; + kvm->debugfs_stat_data[i] = stat_data; + debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } @@ -4900,7 +4934,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, return -ENOENT; if (simple_attr_open(inode, file, get, - KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222 + kvm_stats_debugfs_mode(stat_data->desc) & 0222 ? set : NULL, fmt)) { kvm_put_kvm(stat_data->kvm); @@ -4923,14 +4957,14 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file) static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) { - *val = *(u64 *)((void *)kvm + offset); + *val = *(u64 *)((void *)(&kvm->stat) + offset); return 0; } static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) { - *(u64 *)((void *)kvm + offset) = 0; + *(u64 *)((void *)(&kvm->stat) + offset) = 0; return 0; } @@ -4943,7 +4977,7 @@ static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) *val = 0; kvm_for_each_vcpu(i, vcpu, kvm) - *val += *(u64 *)((void *)vcpu + offset); + *val += *(u64 *)((void *)(&vcpu->stat) + offset); return 0; } @@ -4954,7 +4988,7 @@ static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) - *(u64 *)((void *)vcpu + offset) = 0; + *(u64 *)((void *)(&vcpu->stat) + offset) = 0; return 0; } @@ -4964,14 +4998,14 @@ static int kvm_stat_data_get(void *data, u64 *val) int r = -EFAULT; struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; - switch (stat_data->dbgfs_item->kind) { + switch (stat_data->kind) { case KVM_STAT_VM: r = kvm_get_stat_per_vm(stat_data->kvm, - stat_data->dbgfs_item->offset, val); + stat_data->desc->desc.offset, val); break; case KVM_STAT_VCPU: r = kvm_get_stat_per_vcpu(stat_data->kvm, - stat_data->dbgfs_item->offset, val); + stat_data->desc->desc.offset, val); break; } @@ -4986,14 +5020,14 @@ static int kvm_stat_data_clear(void *data, u64 val) if (val) return -EINVAL; - switch (stat_data->dbgfs_item->kind) { + switch (stat_data->kind) { case KVM_STAT_VM: r = kvm_clear_stat_per_vm(stat_data->kvm, - stat_data->dbgfs_item->offset); + stat_data->desc->desc.offset); break; case KVM_STAT_VCPU: r = kvm_clear_stat_per_vcpu(stat_data->kvm, - stat_data->dbgfs_item->offset); + stat_data->desc->desc.offset); break; } @@ -5050,6 +5084,7 @@ static int vm_stat_clear(void *_offset, u64 val) } DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n"); static int vcpu_stat_get(void *_offset, u64 *val) { @@ -5086,11 +5121,7 @@ static int vcpu_stat_clear(void *_offset, u64 val) DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, "%llu\n"); - -static const struct file_operations *stat_fops[] = { - [KVM_STAT_VCPU] = &vcpu_stat_fops, - [KVM_STAT_VM] = &vm_stat_fops, -}; +DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n"); static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) { @@ -5144,15 +5175,32 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) static void kvm_init_debug(void) { - struct kvm_stats_debugfs_item *p; + const struct file_operations *fops; + const struct _kvm_stats_desc *pdesc; + int i; kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); - kvm_debugfs_num_entries = 0; - for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { - debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), - kvm_debugfs_dir, (void *)(long)p->offset, - stat_fops[p->kind]); + for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) { + pdesc = &kvm_vm_stats_desc[i]; + if (kvm_stats_debugfs_mode(pdesc) & 0222) + fops = &vm_stat_fops; + else + fops = &vm_stat_readonly_fops; + debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), + kvm_debugfs_dir, + (void *)(long)pdesc->desc.offset, fops); + } + + for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) { + pdesc = &kvm_vcpu_stats_desc[i]; + if (kvm_stats_debugfs_mode(pdesc) & 0222) + fops = &vcpu_stat_fops; + else + fops = &vcpu_stat_readonly_fops; + debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc), + kvm_debugfs_dir, + (void *)(long)pdesc->desc.offset, fops); } } -- cgit v1.2.3 From f0d4379087d8a83f478b371ff7786e8df0cc2314 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:46 -0700 Subject: KVM: x86/mmu: Remove broken WARN that fires on 32-bit KVM w/ nested EPT Remove a misguided WARN that attempts to detect the scenario where using a special A/D tracking flag will set reserved bits on a non-MMIO spte. The WARN triggers false positives when using EPT with 32-bit KVM because of the !64-bit clause, which is just flat out wrong. The whole A/D tracking goo is specific to EPT, and one of the big selling points of EPT is that EPT is decoupled from the host's native paging mode. Drop the WARN instead of trying to salvage the check. Keeping a check specific to A/D tracking bits would essentially regurgitate the same code that led to KVM needed the tracking bits in the first place. A better approach would be to add a generic WARN on reserved bits being set, which would naturally cover the A/D tracking bits, work for all flavors of paging, and be self-documenting to some extent. Fixes: 8a406c89532c ("KVM: x86/mmu: Rename and document A/D scheme for TDP SPTEs") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 66d43cec0c31..8e8e8da740a0 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -102,13 +102,6 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, else if (kvm_vcpu_ad_need_write_protect(vcpu)) spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; - /* - * Bits 62:52 of PAE SPTEs are reserved. WARN if said bits are set - * if PAE paging may be employed (shadow paging or any 32-bit KVM). - */ - WARN_ON_ONCE((!tdp_enabled || !IS_ENABLED(CONFIG_X86_64)) && - (spte & SPTE_TDP_AD_MASK)); - /* * For the EPT case, shadow_present_mask is 0 if hardware * supports exec-only page table entries. In that case, -- cgit v1.2.3 From 112022bdb5bc372e00e6e43cb88ee38ea67b97bd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:47 -0700 Subject: KVM: x86/mmu: Treat NX as used (not reserved) for all !TDP shadow MMUs Mark NX as being used for all non-nested shadow MMUs, as KVM will set the NX bit for huge SPTEs if the iTLB mutli-hit mitigation is enabled. Checking the mitigation itself is not sufficient as it can be toggled on at any time and KVM doesn't reset MMU contexts when that happens. KVM could reset the contexts, but that would require purging all SPTEs in all MMUs, for no real benefit. And, KVM already forces EFER.NX=1 when TDP is disabled (for WP=0, SMEP=1, NX=0), so technically NX is never reserved for shadow MMUs. Fixes: b8e8c8303ff2 ("kvm: mmu: ITLB_MULTIHIT mitigation") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b3be690d081a..444e068e6ad9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4221,7 +4221,15 @@ static inline u64 reserved_hpa_bits(void) void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - bool uses_nx = context->nx || + /* + * KVM uses NX when TDP is disabled to handle a variety of scenarios, + * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and + * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. + * The iTLB multi-hit workaround can be toggled at any time, so assume + * NX can be used by any non-nested shadow MMU to avoid having to reset + * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. + */ + bool uses_nx = context->nx || !tdp_enabled || context->mmu_role.base.smep_andnot_wp; struct rsvd_bits_validate *shadow_zero_check; int i; -- cgit v1.2.3 From 0aa1837533e5f4be8cc21bbc06314c23ba2c5447 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:48 -0700 Subject: KVM: x86: Properly reset MMU context at vCPU RESET/INIT Reset the MMU context at vCPU INIT (and RESET for good measure) if CR0.PG was set prior to INIT. Simply re-initializing the current MMU is not sufficient as the current root HPA may not be usable in the new context. E.g. if TDP is disabled and INIT arrives while the vCPU is in long mode, KVM will fail to switch to the 32-bit pae_root and bomb on the next VM-Enter due to running with a 64-bit CR3 in 32-bit mode. This bug was papered over in both VMX and SVM, but still managed to rear its head in the MMU role on VMX. Because EFER.LMA=1 requires CR0.PG=1, kvm_calc_shadow_mmu_root_page_role() checks for EFER.LMA without first checking CR0.PG. VMX's RESET/INIT flow writes CR0 before EFER, and so an INIT with the vCPU in 64-bit mode will cause the hack-a-fix to generate the wrong MMU role. In VMX, the INIT issue is specific to running without unrestricted guest since unrestricted guest is available if and only if EPT is enabled. Commit 8668a3c468ed ("KVM: VMX: Reset mmu context when entering real mode") resolved the issue by forcing a reset when entering emulated real mode. In SVM, commit ebae871a509d ("kvm: svm: reset mmu on VCPU reset") forced a MMU reset on every INIT to workaround the flaw in common x86. Note, at the time the bug was fixed, the SVM problem was exacerbated by a complete lack of a CR4 update. The vendor resets will be reverted in future patches, primarily to aid bisection in case there are non-INIT flows that rely on the existing VMX logic. Because CR0.PG is unconditionally cleared on INIT, and because CR0.WP and all CR4/EFER paging bits are ignored if CR0.PG=0, simply checking that CR0.PG was '1' prior to INIT/RESET is sufficient to detect a required MMU context reset. Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5833b8780808..4bd10fb1dfd6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10754,6 +10754,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + unsigned long old_cr0 = kvm_read_cr0(vcpu); + kvm_lapic_reset(vcpu, init_event); vcpu->arch.hflags = 0; @@ -10822,6 +10824,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.ia32_xss = 0; static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + + /* + * Reset the MMU context if paging was enabled prior to INIT (which is + * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the + * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be + * checked because it is unconditionally cleared on INIT and all other + * paging related bits are ignored if paging is disabled, i.e. CR0.WP, + * CR4, and EFER changes are all irrelevant if CR0.PG was '0'. + */ + if (old_cr0 & X86_CR0_PG) + kvm_mmu_reset_context(vcpu); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) -- cgit v1.2.3 From ef318b9edf66a082f23d00d79b70c17b4c055a26 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:49 -0700 Subject: KVM: x86/mmu: Use MMU's role to detect CR4.SMEP value in nested NPT walk Use the MMU's role to get its effective SMEP value when injecting a fault into the guest. When walking L1's (nested) NPT while L2 is active, vCPU state will reflect L2, whereas NPT uses the host's (L1 in this case) CR0, CR4, EFER, etc... If L1 and L2 have different settings for SMEP and L1 does not have EFER.NX=1, this can result in an incorrect PFEC.FETCH when injecting #NPF. Fixes: e57d4a356ad3 ("KVM: Add instruction fetch checking when walking guest page table") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 823a5919f9fa..52fffd68b522 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -471,8 +471,7 @@ retry_walk: error: errcode |= write_fault | user_fault; - if (fetch_fault && (mmu->nx || - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) + if (fetch_fault && (mmu->nx || mmu->mmu_role.ext.cr4_smep)) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; -- cgit v1.2.3 From f71a53d1180d5ecc346f0c6a23191d837fe2871b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:50 -0700 Subject: Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack" Restore CR4.LA57 to the mmu_role to fix an amusing edge case with nested virtualization. When KVM (L0) is using TDP, CR4.LA57 is not reflected in mmu_role.base.level because that tracks the shadow root level, i.e. TDP level. Normally, this is not an issue because LA57 can't be toggled while long mode is active, i.e. the guest has to first disable paging, then toggle LA57, then re-enable paging, thus ensuring an MMU reinitialization. But if L1 is crafty, it can load a new CR4 on VM-Exit and toggle LA57 without having to bounce through an unpaged section. L1 can also load a new CR3 on exit, i.e. it doesn't even need to play crazy paging games, a single entry PML5 is sufficient. Such shenanigans are only problematic if L0 and L1 use TDP, otherwise L1 and L2 share an MMU that gets reinitialized on nested VM-Enter/VM-Exit due to mmu_role.base.guest_mode. Note, in the L2 case with nested TDP, even though L1 can switch between L2s with different LA57 settings, thus bypassing the paging requirement, in that case KVM's nested_mmu will track LA57 in base.level. This reverts commit 8053f924cad30bf9f9a24e02b6c8ddfabf5202ea. Fixes: 8053f924cad3 ("KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 408051552121..a474cd13b0c8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -320,6 +320,7 @@ union kvm_mmu_extended_role { unsigned int cr4_pke:1; unsigned int cr4_smap:1; unsigned int cr4_smep:1; + unsigned int cr4_la57:1; unsigned int maxphyaddr:6; }; }; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 444e068e6ad9..fa35762f325c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4537,6 +4537,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); ext.cr4_pse = !!is_pse(vcpu); ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); + ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57); ext.maxphyaddr = cpuid_maxphyaddr(vcpu); ext.valid = 1; -- cgit v1.2.3 From 49c6f8756cdffeb9af1fbcb86bacacced26465d7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:51 -0700 Subject: KVM: x86: Force all MMUs to reinitialize if guest CPUID is modified Invalidate all MMUs' roles after a CPUID update to force reinitizliation of the MMU context/helpers. Despite the efforts of commit de3ccd26fafc ("KVM: MMU: record maximum physical address width in kvm_mmu_extended_role"), there are still a handful of CPUID-based properties that affect MMU behavior but are not incorporated into mmu_role. E.g. 1gb hugepage support, AMD vs. Intel handling of bit 8, and SEV's C-Bit location all factor into the guest's reserved PTE bits. The obvious alternative would be to add all such properties to mmu_role, but doing so provides no benefit over simply forcing a reinitialization on every CPUID update, as setting guest CPUID is a rare operation. Note, reinitializing all MMUs after a CPUID update does not fix all of KVM's woes. Specifically, kvm_mmu_page_role doesn't track the CPUID properties, which means that a vCPU can reuse shadow pages that should not exist for the new vCPU model, e.g. that map GPAs that are now illegal (due to MAXPHYADDR changes) or that set bits that are now reserved (PAGE_SIZE for 1gb pages), etc... Tracking the relevant CPUID properties in kvm_mmu_page_role would address the majority of problems, but fully tracking that much state in the shadow page role comes with an unpalatable cost as it would require a non-trivial increase in KVM's memory footprint. The GBPAGES case is even worse, as neither Intel nor AMD provides a way to disable 1gb hugepage support in the hardware page walker, i.e. it's a virtualization hole that can't be closed when using TDP. In other words, resetting the MMU after a CPUID update is largely a superficial fix. But, it will allow reverting the tracking of MAXPHYADDR in the mmu_role, and that case in particular needs to mostly work because KVM's shadow_root_level depends on guest MAXPHYADDR when 5-level paging is supported. For cases where KVM botches guest behavior, the damage is limited to that guest. But for the shadow_root_level, a misconfigured MMU can cause KVM to incorrectly access memory, e.g. due to walking off the end of its shadow page tables. Fixes: 7dcd57552008 ("x86/kvm/mmu: check if tdp/shadow MMU reconfiguration is needed") Cc: Yu Zhang Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 6 +++--- arch/x86/kvm/mmu/mmu.c | 12 ++++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a474cd13b0c8..f1e4d5f2bf8d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1496,6 +1496,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b4da665bb892..c42613cfb5ba 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -202,10 +202,10 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu); /* - * Except for the MMU, which needs to be reset after any vendor - * specific adjustments to the reserved GPA bits. + * Except for the MMU, which needs to do its thing any vendor specific + * adjustments to the reserved GPA bits. */ - kvm_mmu_reset_context(vcpu); + kvm_mmu_after_set_cpuid(vcpu); } static int is_efer_nx(void) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fa35762f325c..1ab3fdb1f2e4 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4903,6 +4903,18 @@ kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) return role.base; } +void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + /* + * Invalidate all MMU roles to force them to reinitialize as CPUID + * information is factored into reserved bit calculations. + */ + vcpu->arch.root_mmu.mmu_role.ext.valid = 0; + vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; + vcpu->arch.nested_mmu.mmu_role.ext.valid = 0; + kvm_mmu_reset_context(vcpu); +} + void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); -- cgit v1.2.3 From 63f5a1909f9e465eb446274969f65471794deafb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:52 -0700 Subject: KVM: x86: Alert userspace that KVM_SET_CPUID{,2} after KVM_RUN is broken Warn userspace that KVM_SET_CPUID{,2} after KVM_RUN "may" cause guest instability. Initialize last_vmentry_cpu to -1 and use it to detect if the vCPU has been run at least once when its CPUID model is changed. KVM does not correctly handle changes to paging related settings in the guest's vCPU model after KVM_RUN, e.g. MAXPHYADDR, GBPAGES, etc... KVM could theoretically zap all shadow pages, but actually making that happen is a mess due to lock inversion (vcpu->mutex is held). And even then, updating paging settings on the fly would only work if all vCPUs are stopped, updated in concert with identical settings, then restarted. To support running vCPUs with different vCPU models (that affect paging), KVM would need to track all relevant information in kvm_mmu_page_role. Note, that's the _page_ role, not the full mmu_role. Updating mmu_role isn't sufficient as a vCPU can reuse a shadow page translation that was created by a vCPU with different settings and thus completely skip the reserved bit checks (that are tied to CPUID). Tracking CPUID state in kvm_mmu_page_role is _extremely_ undesirable as it would require doubling gfn_track from a u16 to a u32, i.e. would increase KVM's memory footprint by 2 bytes for every 4kb of guest memory. E.g. MAXPHYADDR (6 bits), GBPAGES, AMD vs. INTEL = 1 bit, and SEV C-BIT would all need to be tracked. In practice, there is no remotely sane use case for changing any paging related CPUID entries on the fly, so just sweep it under the rug (after yelling at userspace). Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 ++++++++--- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++++++++++ arch/x86/kvm/x86.c | 2 ++ 4 files changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b87fa32835f2..5d8db4922df6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -688,9 +688,14 @@ MSRs that have been set successfully. Defines the vcpu responses to the cpuid instruction. Applications should use the KVM_SET_CPUID2 ioctl if available. -Note, when this IOCTL fails, KVM gives no guarantees that previous valid CPUID -configuration (if there is) is not corrupted. Userspace can get a copy of the -resulting CPUID configuration through KVM_GET_CPUID2 in case. +Caveat emptor: + - If this IOCTL fails, KVM gives no guarantees that previous valid CPUID + configuration (if there is) is not corrupted. Userspace can get a copy + of the resulting CPUID configuration through KVM_GET_CPUID2 in case. + - Using KVM_SET_CPUID{,2} after KVM_RUN, i.e. changing the guest vCPU model + after running the guest, may cause guest instability. + - Using heterogeneous CPUID configurations, modulo APIC IDs, topology, etc... + may cause guest instability. :: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f1e4d5f2bf8d..f8faf3efc08d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -840,7 +840,7 @@ struct kvm_vcpu_arch { bool l1tf_flush_l1d; /* Host CPU on which VM-entry was most recently attempted */ - unsigned int last_vmentry_cpu; + int last_vmentry_cpu; /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1ab3fdb1f2e4..36201c02a472 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4913,6 +4913,26 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; vcpu->arch.nested_mmu.mmu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); + + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise + * sweep the problem under the rug. + * + * KVM's horrific CPUID ABI makes the problem all but impossible to + * solve, as correctly handling multiple vCPU models (with respect to + * paging and physical address properties) in a single VM would require + * tracking all relevant CPUID information in kvm_mmu_page_role. That + * is very undesirable as it would double the memory requirements for + * gfn_track (see struct kvm_mmu_page_role comments), and in practice + * no sane VMM mucks with the core vCPU model on the fly. + */ + if (vcpu->arch.last_vmentry_cpu != -1) { + pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n"); + pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n"); + } } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4bd10fb1dfd6..c862783035b8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10602,6 +10602,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) struct page *page; int r; + vcpu->arch.last_vmentry_cpu = -1; + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else -- cgit v1.2.3 From 6c032f12dd1e80a9dcd4847feab134d14e5551f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:53 -0700 Subject: Revert "KVM: MMU: record maximum physical address width in kvm_mmu_extended_role" Drop MAXPHYADDR from mmu_role now that all MMUs have their role invalidated after a CPUID update. Invalidating the role forces all MMUs to re-evaluate the guest's MAXPHYADDR, and the guest's MAXPHYADDR can only be changed only through a CPUID update. This reverts commit de3ccd26fafc707b09792d9b633c8b5b48865315. Cc: Yu Zhang Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu/mmu.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8faf3efc08d..250915da1681 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -321,7 +321,6 @@ union kvm_mmu_extended_role { unsigned int cr4_smap:1; unsigned int cr4_smep:1; unsigned int cr4_la57:1; - unsigned int maxphyaddr:6; }; }; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 36201c02a472..54514f06714a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4538,7 +4538,6 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) ext.cr4_pse = !!is_pse(vcpu); ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57); - ext.maxphyaddr = cpuid_maxphyaddr(vcpu); ext.valid = 1; -- cgit v1.2.3 From ddc16abbbae9cd21705323d47158fb9c334438ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:54 -0700 Subject: KVM: x86/mmu: Unconditionally zap unsync SPs when creating >4k SP at GFN When creating a new upper-level shadow page, zap unsync shadow pages at the same target gfn instead of attempting to sync the pages. This fixes a bug where an unsync shadow page could be sync'd with an incompatible context, e.g. wrong smm, is_guest, etc... flags. In practice, the bug is relatively benign as sync_page() is all but guaranteed to fail its check that the guest's desired gfn (for the to-be-sync'd page) matches the current gfn associated with the shadow page. I.e. kvm_sync_page() would end up zapping the page anyways. Alternatively, __kvm_sync_page() could be modified to explicitly verify the mmu_role of the unsync shadow page is compatible with the current MMU context. But, except for this specific case, __kvm_sync_page() is called iff the page is compatible, e.g. the transient sync in kvm_mmu_get_page() requires an exact role match, and the call from kvm_sync_mmu_roots() is only synchronizing shadow pages from the current MMU (which better be compatible or KVM has problems). And as described above, attempting to sync shadow pages when creating an upper-level shadow page is unlikely to succeed, e.g. zero successful syncs were observed when running Linux guests despite over a million attempts. Fixes: 9f1a122f970d ("KVM: MMU: allow more page become unsync at getting sp time") Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-10-seanjc@google.com> [Remove WARN_ON after __kvm_sync_page. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 50 ++++++++++++++++---------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 54514f06714a..4af466f0ec6d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1843,24 +1843,6 @@ static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return __kvm_sync_page(vcpu, sp, invalid_list); } -/* @gfn should be write-protected at the call site */ -static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, - struct list_head *invalid_list) -{ - struct kvm_mmu_page *s; - bool ret = false; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { - if (!s->unsync) - continue; - - WARN_ON(s->role.level != PG_LEVEL_4K); - ret |= kvm_sync_page(vcpu, s, invalid_list); - } - - return ret; -} - struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; unsigned int idx[PT64_ROOT_MAX_LEVEL]; @@ -1990,8 +1972,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; - bool need_sync = false; - bool flush = false; int collisions = 0; LIST_HEAD(invalid_list); @@ -2014,11 +1994,21 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, continue; } - if (!need_sync && sp->unsync) - need_sync = true; - - if (sp->role.word != role.word) + if (sp->role.word != role.word) { + /* + * If the guest is creating an upper-level page, zap + * unsync pages for the same gfn. While it's possible + * the guest is using recursive page tables, in all + * likelihood the guest has stopped using the unsync + * page and is installing a completely unrelated page. + * Unsync pages must not be left as is, because the new + * upper-level page will be write-protected. + */ + if (level > PG_LEVEL_4K && sp->unsync) + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); continue; + } if (direct_mmu) goto trace_get_page; @@ -2052,22 +2042,14 @@ trace_get_page: sp->role = role; hlist_add_head(&sp->hash_link, sp_list); if (!direct) { - /* - * we should do write protection before syncing pages - * otherwise the content of the synced shadow page may - * be inconsistent with guest page table. - */ account_shadowed(vcpu->kvm, sp); if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); - - if (level > PG_LEVEL_4K && need_sync) - flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } trace_kvm_mmu_get_page(sp, true); - - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); out: + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; -- cgit v1.2.3 From 00a669780ffa8c4b5f3e37346b5bf45508dd15bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:55 -0700 Subject: KVM: x86/mmu: Use MMU role to check for matching guest page sizes Originally, __kvm_sync_page used to check the cr4_pae bit in the role to avoid zapping 4-byte kvm_mmu_pages when guest page size are 8-byte or the other way round. However, in commit 47c42e6b4192 ("KVM: x86: fix handling of role.cr4_pae and rename it to 'gpte_size'", 2019-03-28) it was observed that this did not work for nested EPT, where the page table size would be 8 bytes even if CR4.PAE=0. (Note that the check still has to be done for nested *NPT*, so it is not possible to use tdp_enabled or similar). Therefore, a hack was introduced to identify nested EPT shadow pages and unconditionally call __kvm_sync_page() on them. However, it is possible to do without the hack to identify nested EPT shadow pages: if EPT is active, there will be no shadow pages in non-EPT format, and all of them will have gpte_is_8_bytes set to true; we can just check the MMU role directly, and the test will always be true. Even for non-EPT shadow MMUs, this test should really always be true now that __kvm_sync_page() is called if and only if the role is an exact match (kvm_mmu_get_page()) or is part of the current MMU context (kvm_mmu_sync_roots()). A future commit will convert the likely-pointless check into a meaningful WARN to enforce that the mmu_roles of the current context and the shadow page are compatible. Cc: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/mmu.rst | 3 --- arch/x86/kvm/mmu/mmu.c | 16 +++------------- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index 20d85daed395..ddbb23998742 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -192,9 +192,6 @@ Shadow pages contain the following information: Contains the value of cr4.smap && !cr0.wp for which the page is valid (pages for which this is true are different from other pages; see the treatment of cr0.wp=0 below). - role.ept_sp: - This is a virtual flag to denote a shadowed nested EPT page. ept_sp - is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination. role.smm: Is 1 if the page is valid in system management mode. This field determines which of the kvm_memslots array was used to build this diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4af466f0ec6d..71a2ee755224 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1780,16 +1780,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else -static inline bool is_ept_sp(struct kvm_mmu_page *sp) -{ - return sp->role.cr0_wp && sp->role.smap_andnot_wp; -} - /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) || + union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; + + if (sp->role.gpte_is_8_bytes != mmu_role.gpte_is_8_bytes || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; @@ -4721,13 +4718,6 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.guest_mode = true; role.base.access = ACC_ALL; - /* - * WP=1 and NOT_WP=1 is an impossible combination, use WP and the - * SMAP variation to denote shadow EPT entries. - */ - role.base.cr0_wp = true; - role.base.smap_andnot_wp = true; - role.ext = kvm_calc_mmu_role_ext(vcpu); role.ext.execonly = execonly; -- cgit v1.2.3 From 2640b0865395b6a31f76d6eca9937dec3e876ca3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:56 -0700 Subject: KVM: x86/mmu: WARN and zap SP when sync'ing if MMU role mismatches When synchronizing a shadow page, WARN and zap the page if its mmu role isn't compatible with the current MMU context, where "compatible" is an exact match sans the bits that have no meaning in the overall MMU context or will be explicitly overwritten during the sync. Many of the helpers used by sync_page() are specific to the current context, updating a SMM vs. non-SMM shadow page would use the wrong memslots, updating L1 vs. L2 PTEs might work but would be extremely bizaree, and so on and so forth. Drop the guard with respect to 8-byte vs. 4-byte PTEs in __kvm_sync_page(), it was made useless when kvm_mmu_get_page() stopped trying to sync shadow pages irrespective of the current MMU context. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 5 +---- arch/x86/kvm/mmu/paging_tmpl.h | 27 +++++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 71a2ee755224..e4415e739807 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1784,10 +1784,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; - - if (sp->role.gpte_is_8_bytes != mmu_role.gpte_is_8_bytes || - vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { + if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 52fffd68b522..b632606a87d6 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -1030,13 +1030,36 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { + union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; int set_spte_ret = 0; - /* direct kvm_mmu_page can not be unsync. */ - BUG_ON(sp->role.direct); + /* + * Ignore various flags when verifying that it's safe to sync a shadow + * page using the current MMU context. + * + * - level: not part of the overall MMU role and will never match as the MMU's + * level tracks the root level + * - access: updated based on the new guest PTE + * - quadrant: not part of the overall MMU role (similar to level) + */ + const union kvm_mmu_page_role sync_role_ign = { + .level = 0xf, + .access = 0x7, + .quadrant = 0x3, + }; + + /* + * Direct pages can never be unsync, and KVM should never attempt to + * sync a shadow page for a different MMU context, e.g. if the role + * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the + * reserved bits checks will be wrong, etc... + */ + if (WARN_ON_ONCE(sp->role.direct || + (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word)) + return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); -- cgit v1.2.3 From 07dc4f35a44c8f85ba7262b56b70c3fcbc3b74fd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jun 2021 12:49:19 -0400 Subject: KVM: x86/mmu: comment on kvm_mmu_get_page's syncing of pages Explain the usage of sync_page() in kvm_mmu_get_page(), which is subtle in how and why it differs from mmu_sync_children(). Signed-off-by: Sean Christopherson [Split out of a different patch by Sean. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e4415e739807..726e5b171543 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2008,8 +2008,17 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, goto trace_get_page; if (sp->unsync) { - /* The page is good, but __kvm_sync_page might still end - * up zapping it. If so, break in order to rebuild it. + /* + * The page is good, but is stale. __kvm_sync_page does + * get the latest guest state, but (unlike mmu_unsync_children) + * it doesn't write-protect the page or mark it synchronized! + * This way the validity of the mapping is ensured, but the + * overhead of write protection is not incurred until the + * guest invalidates the TLB mapping. This allows multiple + * SPs for a single gfn to be unsync. + * + * If the sync fails, the page is zapped. If so, break + * in order to rebuild it. */ if (!__kvm_sync_page(vcpu, sp, &invalid_list)) break; -- cgit v1.2.3 From 479a1efc8119d8699cca73d00625b28003d0a1f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:57 -0700 Subject: KVM: x86/mmu: Drop the intermediate "transient" __kvm_sync_page() Nove the kvm_unlink_unsync_page() call out of kvm_sync_page() and into it's sole caller, and fold __kvm_sync_page() into kvm_sync_page() since the latter becomes a pure pass-through. There really should be no reason for code to do a complete sync of a shadow page outside of the full kvm_mmu_sync_roots(), e.g. the one use case that creeped in turned out to be flawed and counter-productive. Drop the stale comment about @sp->gfn needing to be write-protected, as it directly contradicts the kvm_mmu_get_page() usage. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 726e5b171543..92b7ab1a0a77 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1780,9 +1780,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else -/* @sp->gfn should be write-protected at the call site */ -static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) +static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); @@ -1830,13 +1829,6 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } -static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list) -{ - kvm_unlink_unsync_page(vcpu->kvm, sp); - return __kvm_sync_page(vcpu, sp, invalid_list); -} - struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; unsigned int idx[PT64_ROOT_MAX_LEVEL]; @@ -1931,6 +1923,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, } for_each_sp(pages, sp, parents, i) { + kvm_unlink_unsync_page(vcpu->kvm, sp); flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } @@ -2009,7 +2002,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->unsync) { /* - * The page is good, but is stale. __kvm_sync_page does + * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) * it doesn't write-protect the page or mark it synchronized! * This way the validity of the mapping is ensured, but the @@ -2020,7 +2013,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * If the sync fails, the page is zapped. If so, break * in order to rebuild it. */ - if (!__kvm_sync_page(vcpu, sp, &invalid_list)) + if (!kvm_sync_page(vcpu, sp, &invalid_list)) break; WARN_ON(!list_empty(&invalid_list)); -- cgit v1.2.3 From 0337f585f57fc80a50e0645ca709512687185c72 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:58 -0700 Subject: KVM: x86/mmu: Rename unsync helper and update related comments Rename mmu_need_write_protect() to mmu_try_to_unsync_pages() and update a variety of related, stale comments. Add several new comments to call out subtle details, e.g. that upper-level shadow pages are write-tracked, and that can_unsync is false iff KVM is in the process of synchronizing pages. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 34 +++++++++++++++++++++++++--------- arch/x86/kvm/mmu/mmu_internal.h | 3 +-- arch/x86/kvm/mmu/spte.c | 10 ++++++++-- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 92b7ab1a0a77..dffa9486e642 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2458,17 +2458,33 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) +/* + * Attempt to unsync any shadow pages that can be reached by the specified gfn, + * KVM is creating a writable mapping for said gfn. Returns 0 if all pages + * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must + * be write-protected. + */ +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + /* + * Force write-protection if the page is being tracked. Note, the page + * track machinery is used to write-protect upper-level shadow pages, + * i.e. this guards the role.level == 4K assertion below! + */ if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) - return true; + return -EPERM; + /* + * The page is not write-tracked, mark existing shadow pages unsync + * unless KVM is synchronizing an unsync SP (can_unsync = false). In + * that case, KVM must complete emulation of the guest TLB flush before + * allowing shadow pages to become unsync (writable by the guest). + */ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) - return true; + return -EPERM; if (sp->unsync) continue; @@ -2499,8 +2515,8 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, * 2.2 Guest issues TLB flush. * That causes a VM Exit. * - * 2.3 kvm_mmu_sync_pages() reads sp->unsync. - * Since it is false, so it just returns. + * 2.3 Walking of unsync pages sees sp->unsync is + * false and skips the page. * * 2.4 Guest accesses GVA X. * Since the mapping in the SP was not updated, @@ -2516,7 +2532,7 @@ bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, */ smp_wmb(); - return false; + return 0; } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, @@ -3461,8 +3477,8 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) * flush strictly after those changes are made. We only need to * ensure that the other CPU sets these flags before any actual * changes to the page tables are made. The comments in - * mmu_need_write_protect() describe what could go wrong if this - * requirement isn't satisfied. + * mmu_try_to_unsync_pages() describe what could go wrong if + * this requirement isn't satisfied. */ if (!smp_load_acquire(&sp->unsync) && !smp_load_acquire(&sp->unsync_children)) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 18be103df9d5..35567293c1fd 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -122,8 +122,7 @@ static inline bool is_nx_huge_page_enabled(void) return READ_ONCE(nx_huge_pages); } -bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync); +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 8e8e8da740a0..246e61e0771e 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -147,13 +147,19 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection - * is responsibility of mmu_get_page / kvm_sync_page. + * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. * Same reasoning can be applied to dirty page accounting. */ if (!can_unsync && is_writable_pte(old_spte)) goto out; - if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { + /* + * Unsync shadow pages that are reachable by the new, writable + * SPTE. Write-protect the SPTE if the page can't be unsync'd, + * e.g. it's write-tracked (upper-level SPs) or has one or more + * shadow pages and unsync'ing pages is not allowed. + */ + if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); ret |= SET_SPTE_WRITE_PROTECTED_PT; -- cgit v1.2.3 From dbc4739b6b3ed478531155c832573a3fb1ab32d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:56:59 -0700 Subject: KVM: x86: Fix sizes used to pass around CR0, CR4, and EFER When configuring KVM's MMU, pass CR0 and CR4 as unsigned longs, and EFER as a u64 in various flows (mostly MMU). Passing the params as u32s is functionally ok since all of the affected registers reserve bits 63:32 to zero (enforced by KVM), but it's technically wrong. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 4 ++-- arch/x86/kvm/mmu/mmu.c | 11 ++++++----- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bc11402df83b..47131b92b990 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -66,8 +66,8 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu); -void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, - gpa_t nested_cr3); +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, + unsigned long cr4, u64 efer, gpa_t nested_cr3); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index dffa9486e642..f3c4c6349ddc 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4659,8 +4659,8 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) } static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - u32 cr0, u32 cr4, u32 efer, - union kvm_mmu_role new_role) + unsigned long cr0, unsigned long cr4, + u64 efer, union kvm_mmu_role new_role) { if (!(cr0 & X86_CR0_PG)) nonpaging_init_context(vcpu, context); @@ -4675,7 +4675,8 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte reset_shadow_zero_bits_mask(vcpu, context); } -static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, + unsigned long cr4, u64 efer) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = @@ -4697,8 +4698,8 @@ kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) return role; } -void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, - gpa_t nested_cr3) +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, + unsigned long cr4, u64 efer, gpa_t nested_cr3) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index dca20f949b63..9f0e7ed672b2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1244,8 +1244,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, &user_kvm_nested_state->data.svm[0]; struct vmcb_control_area *ctl; struct vmcb_save_area *save; + unsigned long cr0; int ret; - u32 cr0; BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) > KVM_STATE_NESTED_SVM_VMCB_SIZE); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c862783035b8..0b059698cd5c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9095,8 +9095,8 @@ static void enter_smm(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ds; struct desc_ptr dt; + unsigned long cr0; char buf[512]; - u32 cr0; memset(buf, 0, 512); #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 31e96bc63655ba643e31d83d8652b43f01e43f5b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:00 -0700 Subject: KVM: nSVM: Add a comment to document why nNPT uses vmcb01, not vCPU state Add a comment in the nested NPT initialization flow to call out that it intentionally uses vmcb01 instead current vCPU state to get the effective hCR4 and hEFER for L1's NPT context. Note, despite nSVM's efforts to handle the case where vCPU state doesn't reflect L1 state, the MMU may still do the wrong thing due to pulling state from the vCPU instead of the passed in CR0/CR4/EFER values. This will be addressed in future commits. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-16-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9f0e7ed672b2..f17d8c9050c0 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -98,6 +98,12 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; + + /* + * The NPT format depends on L1's CR4 and EFER, which is in vmcb01. Note, + * when called via KVM_SET_NESTED_STATE, that state may _not_ match current + * vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required. + */ kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4, svm->vmcb01.ptr->save.efer, svm->nested.ctl.nested_cr3); -- cgit v1.2.3 From 18feaad3c6556192b0d28f0777b021d137076917 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:01 -0700 Subject: KVM: x86/mmu: Drop smep_andnot_wp check from "uses NX" for shadow MMUs Drop the smep_andnot_wp role check from the "uses NX" calculation now that all non-nested shadow MMUs treat NX as used via the !TDP check. The shadow MMU for nested NPT, which shares the helper, does not need to deal with SMEP (or WP) as NPT walks are always "user" accesses and WP is explicitly noted as being ignored: Table walks for guest page tables are always treated as user writes at the nested page table level. A table walk for the guest page itself is always treated as a user access at the nested page table level The host hCR0.WP bit is ignored under nested paging. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-17-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f3c4c6349ddc..588d789cc79f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4223,8 +4223,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) * NX can be used by any non-nested shadow MMU to avoid having to reset * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. */ - bool uses_nx = context->nx || !tdp_enabled || - context->mmu_role.base.smep_andnot_wp; + bool uses_nx = context->nx || !tdp_enabled; struct rsvd_bits_validate *shadow_zero_check; int i; -- cgit v1.2.3 From 20f632bd0060e12fca083adc44b097231e2f4649 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:02 -0700 Subject: KVM: x86: Read and pass all CR0/CR4 role bits to shadow MMU helper Grab all CR0/CR4 MMU role bits from current vCPU state when initializing a non-nested shadow MMU. Extract the masks from kvm_post_set_cr{0,4}(), as the CR0/CR4 update masks must exactly match the mmu_role bits, with one exception (see below). The "full" CR0/CR4 will be used by future commits to initialize the MMU and its role, as opposed to the current approach of pulling everything from vCPU, which is incorrect for certain flows, e.g. nested NPT. CR4.LA57 is an exception, as it can be toggled on VM-Exit (for L1's MMU) but can't be toggled via MOV CR4 while long mode is active. I.e. LA57 needs to be in the mmu_role, but technically doesn't need to be checked by kvm_post_set_cr4(). However, the extra check is completely benign as the hardware restrictions simply mean LA57 will never be _the_ cause of a MMU reset during MOV CR4. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-18-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 6 ++++++ arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/x86.c | 9 ++------- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 47131b92b990..4e926f4935b0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,6 +44,12 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | \ + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \ + X86_CR4_LA57) + +#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP) + static __always_inline u64 rsvd_bits(int s, int e) { BUILD_BUG_ON(__builtin_constant_p(e) && __builtin_constant_p(s) && e < s); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 588d789cc79f..51a48f17c80e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4778,8 +4778,8 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, - kvm_read_cr0_bits(vcpu, X86_CR0_PG), - kvm_read_cr4_bits(vcpu, X86_CR4_PAE), + kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), + kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), vcpu->arch.efer); context->get_guest_pgd = get_cr3; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b059698cd5c..a7c7b2b28de7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -851,14 +851,12 @@ EXPORT_SYMBOL_GPL(load_pdptrs); void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; - if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); } - if ((cr0 ^ old_cr0) & update_bits) + if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) kvm_mmu_reset_context(vcpu); if (((cr0 ^ old_cr0) & X86_CR0_CD) && @@ -1037,10 +1035,7 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { - unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - - if (((cr4 ^ old_cr4) & mmu_role_bits) || + if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) || (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); } -- cgit v1.2.3 From 16be1d12925305d4d20fd897632d9a6836a865c8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:03 -0700 Subject: KVM: x86/mmu: Move nested NPT reserved bit calculation into MMU proper Move nested NPT's invocation of reset_shadow_zero_bits_mask() into the MMU proper and unexport said function. Aside from dropping an export, this is a baby step toward eliminating the call entirely by fixing the shadow_root_level confusion. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-19-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 3 --- arch/x86/kvm/mmu/mmu.c | 11 ++++++++--- arch/x86/kvm/svm/nested.c | 1 - 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 4e926f4935b0..62844bacd13f 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -68,9 +68,6 @@ static __always_inline u64 rsvd_bits(int s, int e) void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); -void -reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); - void kvm_init_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, unsigned long cr4, u64 efer, gpa_t nested_cr3); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 51a48f17c80e..0c23a6d5722d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4212,8 +4212,8 @@ static inline u64 reserved_hpa_bits(void) * table in guest or amd nested guest, its mmu features completely * follow the features in guest. */ -void -reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { /* * KVM uses NX when TDP is disabled to handle a variety of scenarios, @@ -4247,7 +4247,6 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) } } -EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); static inline bool boot_cpu_is_amd(void) { @@ -4714,6 +4713,12 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, */ context->shadow_root_level = new_role.base.level; } + + /* + * Redo the shadow bits, the reset done by shadow_mmu_init_context() + * (above) may use the wrong shadow_root_level. + */ + reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f17d8c9050c0..a9e3b0736c20 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -110,7 +110,6 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } -- cgit v1.2.3 From d555f7057ebe34aae42fe2f592a3047e9b151326 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:04 -0700 Subject: KVM: x86/mmu: Grab shadow root level from mmu_role for shadow MMUs Use the mmu_role to initialize shadow root level instead of assuming the level of KVM's shadow root (host) is the same as that of the guest root, or in the case of 32-bit non-PAE paging where KVM forces PAE paging. For nested NPT, the shadow root level cannot be adapted to L1's NPT root level and is instead always the TDP root level because NPT uses the current host CR0/CR4/EFER, e.g. 64-bit KVM can't drop into 32-bit PAE to shadow L1's NPT. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0c23a6d5722d..466cb93eb3b5 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3898,7 +3898,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->sync_page = nonpaging_sync_page; context->invlpg = NULL; context->root_level = 0; - context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = true; context->nx = false; } @@ -4466,10 +4465,10 @@ static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu static void paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - int level) + int root_level) { context->nx = is_nx(vcpu); - context->root_level = level; + context->root_level = root_level; reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); @@ -4481,7 +4480,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; - context->shadow_root_level = level; context->direct_map = false; } @@ -4509,7 +4507,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->gva_to_gpa = paging32_gva_to_gpa; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; - context->shadow_root_level = PT32E_ROOT_LEVEL; context->direct_map = false; } @@ -4669,6 +4666,8 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte else paging32_init_context(vcpu, context); + context->shadow_root_level = new_role.base.level; + context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } @@ -4704,16 +4703,9 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); - if (new_role.as_u64 != context->mmu_role.as_u64) { + if (new_role.as_u64 != context->mmu_role.as_u64) shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); - /* - * Override the level set by the common init helper, nested TDP - * always uses the host's TDP configuration. - */ - context->shadow_root_level = new_role.base.level; - } - /* * Redo the shadow bits, the reset done by shadow_mmu_init_context() * (above) may use the wrong shadow_root_level. -- cgit v1.2.3 From 594e91a100ccab334675c4fc9145e6ef3c788449 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:05 -0700 Subject: KVM: x86/mmu: Add struct and helpers to retrieve MMU role bits from regs Introduce "struct kvm_mmu_role_regs" to hold the register state that is incorporated into the mmu_role. For nested TDP, the register state that is factored into the MMU isn't vCPU state; the dedicated struct will be used to propagate the correct state throughout the flows without having to pass multiple params, and also provides helpers for the various flag accessors. Intentionally make the new helpers cumbersome/ugly by prepending four underscores. In the not-too-distant future, it will be preferable to use the mmu_role to query bits as the mmu_role can drop irrelevant bits without creating contradictions, e.g. clearing CR4 bits when CR0.PG=0. Reserve the clean helper names (no underscores) for the mmu_role. Add a helper for vCPU conversion, which is the common case. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 66 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 466cb93eb3b5..a9c968bb3eae 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -176,9 +176,46 @@ static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); +struct kvm_mmu_role_regs { + const unsigned long cr0; + const unsigned long cr4; + const u64 efer; +}; + #define CREATE_TRACE_POINTS #include "mmutrace.h" +/* + * Yes, lot's of underscores. They're a hint that you probably shouldn't be + * reading from the role_regs. Once the mmu_role is constructed, it becomes + * the single source of truth for the MMU's state. + */ +#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ +static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +{ \ + return !!(regs->reg & flag); \ +} +BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE); +BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57); +BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX); +BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); + +static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_role_regs regs = { + .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), + .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), + .efer = vcpu->arch.efer, + }; + + return regs; +} static inline bool kvm_available_flush_tlb_with_range(void) { @@ -4654,14 +4691,14 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) } static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, - unsigned long cr0, unsigned long cr4, - u64 efer, union kvm_mmu_role new_role) + struct kvm_mmu_role_regs *regs, + union kvm_mmu_role new_role) { - if (!(cr0 & X86_CR0_PG)) + if (!____is_cr0_pg(regs)) nonpaging_init_context(vcpu, context); - else if (efer & EFER_LMA) + else if (____is_efer_lma(regs)) paging64_init_context(vcpu, context); - else if (cr4 & X86_CR4_PAE) + else if (____is_cr4_pae(regs)) paging32E_init_context(vcpu, context); else paging32_init_context(vcpu, context); @@ -4672,15 +4709,15 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte reset_shadow_zero_bits_mask(vcpu, context); } -static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, - unsigned long cr4, u64 efer) +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = kvm_calc_shadow_mmu_root_page_role(vcpu, false); if (new_role.as_u64 != context->mmu_role.as_u64) - shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); + shadow_mmu_init_context(vcpu, context, regs, new_role); } static union kvm_mmu_role @@ -4699,12 +4736,17 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, unsigned long cr4, u64 efer, gpa_t nested_cr3) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; + struct kvm_mmu_role_regs regs = { + .cr0 = cr0, + .cr4 = cr4, + .efer = efer, + }; union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); if (new_role.as_u64 != context->mmu_role.as_u64) - shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); + shadow_mmu_init_context(vcpu, context, ®s, new_role); /* * Redo the shadow bits, the reset done by shadow_mmu_init_context() @@ -4773,11 +4815,9 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.root_mmu; + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); - kvm_init_shadow_mmu(vcpu, - kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), - kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), - vcpu->arch.efer); + kvm_init_shadow_mmu(vcpu, ®s); context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; -- cgit v1.2.3 From af098972295aab280b362090aef964d4eb89f63f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:06 -0700 Subject: KVM: x86/mmu: Consolidate misc updates into shadow_mmu_init_context() Consolidate the MMU metadata update calls to deduplicate code, and to prep for future cleanup. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a9c968bb3eae..28bfe18eb416 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4507,11 +4507,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->nx = is_nx(vcpu); context->root_level = root_level; - reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); - update_last_nonleaf_level(vcpu, context); - MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; @@ -4534,12 +4529,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, { context->nx = false; context->root_level = PT32_ROOT_LEVEL; - - reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context, false); - update_pkru_bitmask(vcpu, context, false); - update_last_nonleaf_level(vcpu, context); - context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->sync_page = paging32_sync_page; @@ -4703,6 +4692,12 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte else paging32_init_context(vcpu, context); + if (____is_cr0_pg(regs)) { + reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); + update_last_nonleaf_level(vcpu, context); + } context->shadow_root_level = new_role.base.level; context->mmu_role.as_u64 = new_role.as_u64; -- cgit v1.2.3 From cd6767c334b628cf566db56c778e67f7e6ae2845 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:07 -0700 Subject: KVM: x86/mmu: Ignore CR0 and CR4 bits in nested EPT MMU role Do not incorporate CR0/CR4 bits into the role for the nested EPT MMU, as EPT behavior is not influenced by CR0/CR4. Note, this is the guest_mmu, (L1's EPT), not nested_mmu (L2's IA32 paging); the nested_mmu does need CR0/CR4, and is initialized in a separate flow. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 28bfe18eb416..52e405555cd6 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4767,8 +4767,10 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, role.base.guest_mode = true; role.base.access = ACC_ALL; - role.ext = kvm_calc_mmu_role_ext(vcpu); + /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ + role.ext.word = 0; role.ext.execonly = execonly; + role.ext.valid = 1; return role; } -- cgit v1.2.3 From 8626c120baefe68d22a22d6af9a7eed0b50bee90 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:08 -0700 Subject: KVM: x86/mmu: Use MMU's role_regs, not vCPU state, to compute mmu_role Use the provided role_regs to calculate the mmu_role instead of pulling bits from current vCPU state. For some flows, e.g. nested TDP, the vCPU state may not be correct (or relevant). Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-24-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 92 ++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 52e405555cd6..81992ba2899f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4542,17 +4542,18 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } -static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) +static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { union kvm_mmu_extended_role ext = {0}; - ext.cr0_pg = !!is_paging(vcpu); - ext.cr4_pae = !!is_pae(vcpu); - ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); - ext.cr4_pse = !!is_pse(vcpu); - ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); - ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57); + ext.cr0_pg = ____is_cr0_pg(regs); + ext.cr4_pae = ____is_cr4_pae(regs); + ext.cr4_smep = ____is_cr4_smep(regs); + ext.cr4_smap = ____is_cr4_smap(regs); + ext.cr4_pse = ____is_cr4_pse(regs); + ext.cr4_pke = ____is_cr4_pke(regs); + ext.cr4_la57 = ____is_cr4_la57(regs); ext.valid = 1; @@ -4560,20 +4561,21 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) } static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { union kvm_mmu_role role = {0}; role.base.access = ACC_ALL; - role.base.nxe = !!is_nx(vcpu); - role.base.cr0_wp = is_write_protection(vcpu); + role.base.nxe = ____is_efer_nx(regs); + role.base.cr0_wp = ____is_cr0_wp(regs); role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); if (base_only) return role; - role.ext = kvm_calc_mmu_role_ext(vcpu); + role.ext = kvm_calc_mmu_role_ext(vcpu, regs); return role; } @@ -4588,9 +4590,10 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { - union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); role.base.level = kvm_mmu_get_tdp_level(vcpu); @@ -4603,8 +4606,9 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.root_mmu; + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_mmu_role new_role = - kvm_calc_tdp_mmu_root_page_role(vcpu, false); + kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, false); if (new_role.as_u64 == context->mmu_role.as_u64) return; @@ -4648,30 +4652,30 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { - union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); - role.base.smep_andnot_wp = role.ext.cr4_smep && - !is_write_protection(vcpu); - role.base.smap_andnot_wp = role.ext.cr4_smap && - !is_write_protection(vcpu); - role.base.gpte_is_8_bytes = !!is_pae(vcpu); + role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); + role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); + role.base.gpte_is_8_bytes = ____is_cr4_pae(regs); return role; } static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs, bool base_only) { union kvm_mmu_role role = - kvm_calc_shadow_root_page_role_common(vcpu, base_only); + kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only); - role.base.direct = !is_paging(vcpu); + role.base.direct = !____is_cr0_pg(regs); - if (!is_long_mode(vcpu)) + if (!____is_efer_lma(regs)) role.base.level = PT32E_ROOT_LEVEL; - else if (is_la57_mode(vcpu)) + else if (____is_cr4_la57(regs)) role.base.level = PT64_ROOT_5LEVEL; else role.base.level = PT64_ROOT_4LEVEL; @@ -4709,17 +4713,18 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); + kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false); if (new_role.as_u64 != context->mmu_role.as_u64) shadow_mmu_init_context(vcpu, context, regs, new_role); } static union kvm_mmu_role -kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) +kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu, + struct kvm_mmu_role_regs *regs) { union kvm_mmu_role role = - kvm_calc_shadow_root_page_role_common(vcpu, false); + kvm_calc_shadow_root_page_role_common(vcpu, regs, false); role.base.direct = false; role.base.level = kvm_mmu_get_tdp_level(vcpu); @@ -4736,7 +4741,9 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, .cr4 = cr4, .efer = efer, }; - union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); + union kvm_mmu_role new_role; + + new_role = kvm_calc_shadow_npt_root_page_role(vcpu, ®s); __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); @@ -4821,9 +4828,12 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) context->inject_page_fault = kvm_inject_page_fault; } -static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) +static union kvm_mmu_role +kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) { - union kvm_mmu_role role = kvm_calc_shadow_root_page_role_common(vcpu, false); + union kvm_mmu_role role; + + role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false); /* * Nested MMUs are used only for walking L2's gva->gpa, they never have @@ -4832,12 +4842,12 @@ static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) */ role.base.direct = true; - if (!is_paging(vcpu)) + if (!____is_cr0_pg(regs)) role.base.level = 0; - else if (is_long_mode(vcpu)) - role.base.level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : - PT64_ROOT_4LEVEL; - else if (is_pae(vcpu)) + else if (____is_efer_lma(regs)) + role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : + PT64_ROOT_4LEVEL; + else if (____is_cr4_pae(regs)) role.base.level = PT32E_ROOT_LEVEL; else role.base.level = PT32_ROOT_LEVEL; @@ -4847,7 +4857,8 @@ static union kvm_mmu_role kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu) static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { - union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu); + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); + union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, ®s); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; if (new_role.as_u64 == g_context->mmu_role.as_u64) @@ -4913,12 +4924,13 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) { + struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_mmu_role role; if (tdp_enabled) - role = kvm_calc_tdp_mmu_root_page_role(vcpu, true); + role = kvm_calc_tdp_mmu_root_page_role(vcpu, ®s, true); else - role = kvm_calc_shadow_mmu_root_page_role(vcpu, true); + role = kvm_calc_shadow_mmu_root_page_role(vcpu, ®s, true); return role.base; } -- cgit v1.2.3 From 167f8a5cae99fb2050d3d674ca84457a526e23dd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:09 -0700 Subject: KVM: x86/mmu: Rename "nxe" role bit to "efer_nx" for macro shenanigans Rename "nxe" to "efer_nx" so that future macro magic can use the pattern _ for all CR0, CR4, and EFER bits that included in the role. Using "efer_nx" also makes it clear that the role bit reflects EFER.NX, not the NX bit in the corresponding PTE. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-25-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/mmu.rst | 4 ++-- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/mmutrace.h | 2 +- tools/lib/traceevent/plugins/plugin_kvm.c | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst index ddbb23998742..f60f5488e121 100644 --- a/Documentation/virt/kvm/mmu.rst +++ b/Documentation/virt/kvm/mmu.rst @@ -180,8 +180,8 @@ Shadow pages contain the following information: role.gpte_is_8_bytes: Reflects the size of the guest PTE for which the page is valid, i.e. '1' if 64-bit gptes are in use, '0' if 32-bit gptes are in use. - role.nxe: - Contains the value of efer.nxe for which the page is valid. + role.efer_nx: + Contains the value of efer.nx for which the page is valid. role.cr0_wp: Contains the value of cr0.wp for which the page is valid. role.smep_andnot_wp: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 250915da1681..520140eed423 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -274,7 +274,7 @@ struct kvm_kernel_irq_routing_entry; * by indirect shadow page can not be more than 15 bits. * * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access, - * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. + * @efer_nx, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. */ union kvm_mmu_page_role { u32 word; @@ -285,7 +285,7 @@ union kvm_mmu_page_role { unsigned direct:1; unsigned access:3; unsigned invalid:1; - unsigned nxe:1; + unsigned efer_nx:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 81992ba2899f..25f23de89cdf 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4567,7 +4567,7 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, union kvm_mmu_role role = {0}; role.base.access = ACC_ALL; - role.base.nxe = ____is_efer_nx(regs); + role.base.efer_nx = ____is_efer_nx(regs); role.base.cr0_wp = ____is_cr0_wp(regs); role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index e798489b56b5..efbad33a0645 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -40,7 +40,7 @@ role.direct ? " direct" : "", \ access_str[role.access], \ role.invalid ? " invalid" : "", \ - role.nxe ? "" : "!", \ + role.efer_nx ? "" : "!", \ role.ad_disabled ? "!" : "", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ diff --git a/tools/lib/traceevent/plugins/plugin_kvm.c b/tools/lib/traceevent/plugins/plugin_kvm.c index 51ceeb9147eb..9ce7b4b68e3f 100644 --- a/tools/lib/traceevent/plugins/plugin_kvm.c +++ b/tools/lib/traceevent/plugins/plugin_kvm.c @@ -366,7 +366,7 @@ union kvm_mmu_page_role { unsigned direct:1; unsigned access:3; unsigned invalid:1; - unsigned nxe:1; + unsigned efer_nx:1; unsigned cr0_wp:1; unsigned smep_and_not_wp:1; unsigned smap_and_not_wp:1; @@ -403,7 +403,7 @@ static int kvm_mmu_print_role(struct trace_seq *s, struct tep_record *record, access_str[role.access], role.invalid ? " invalid" : "", role.cr4_pae ? "" : "!", - role.nxe ? "" : "!", + role.efer_nx ? "" : "!", role.cr0_wp ? "" : "!", role.smep_and_not_wp ? " smep" : "", role.smap_and_not_wp ? " smap" : "", -- cgit v1.2.3 From 6066772455f21ce1e90f003243c9864091621773 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:10 -0700 Subject: KVM: x86/mmu: Add accessors to query mmu_role bits Add accessors via a builder macro for all mmu_role bits that track a CR0, CR4, or EFER bit, abstracting whether the bits are in the base or the extended role. Future commits will switch to using mmu_role instead of vCPU state to configure the MMU, i.e. there are about to be a large number of users. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-26-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 21 +++++++++++++++++++++ arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 25f23de89cdf..1e5beac6920f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -206,6 +206,27 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57); BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX); BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); +/* + * The MMU itself (with a valid role) is the single source of truth for the + * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The + * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1, + * and the vCPU may be incorrect/irrelevant. + */ +#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ +static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \ +{ \ + return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ +} +BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg); +BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); +BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); +BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); + static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) { struct kvm_mmu_role_regs regs = { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index b632606a87d6..5cf36eb96ee2 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -471,7 +471,7 @@ retry_walk: error: errcode |= write_fault | user_fault; - if (fetch_fault && (mmu->nx || mmu->mmu_role.ext.cr4_smep)) + if (fetch_fault && (mmu->nx || is_cr4_smep(mmu))) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; -- cgit v1.2.3 From ca8d664f509932eb316a4ae3926176be745e3b3d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:11 -0700 Subject: KVM: x86/mmu: Do not set paging-related bits in MMU role if CR0.PG=0 Don't set CR0/CR4/EFER bits in the MMU role if paging is disabled, paging modifiers are irrelevant if there is no paging in the first place. Somewhat arbitrarily clear gpte_is_8_bytes for shadow paging if paging is disabled in the guest. Again, there are no guest PTEs to process, so the size is meaningless. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-27-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1e5beac6920f..b109ea16d39e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4568,13 +4568,15 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, { union kvm_mmu_extended_role ext = {0}; - ext.cr0_pg = ____is_cr0_pg(regs); - ext.cr4_pae = ____is_cr4_pae(regs); - ext.cr4_smep = ____is_cr4_smep(regs); - ext.cr4_smap = ____is_cr4_smap(regs); - ext.cr4_pse = ____is_cr4_pse(regs); - ext.cr4_pke = ____is_cr4_pke(regs); - ext.cr4_la57 = ____is_cr4_la57(regs); + if (____is_cr0_pg(regs)) { + ext.cr0_pg = 1; + ext.cr4_pae = ____is_cr4_pae(regs); + ext.cr4_smep = ____is_cr4_smep(regs); + ext.cr4_smap = ____is_cr4_smap(regs); + ext.cr4_pse = ____is_cr4_pse(regs); + ext.cr4_pke = ____is_cr4_pke(regs); + ext.cr4_la57 = ____is_cr4_la57(regs); + } ext.valid = 1; @@ -4588,8 +4590,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, union kvm_mmu_role role = {0}; role.base.access = ACC_ALL; - role.base.efer_nx = ____is_efer_nx(regs); - role.base.cr0_wp = ____is_cr0_wp(regs); + if (____is_cr0_pg(regs)) { + role.base.efer_nx = ____is_efer_nx(regs); + role.base.cr0_wp = ____is_cr0_wp(regs); + } role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); @@ -4680,7 +4684,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); - role.base.gpte_is_8_bytes = ____is_cr4_pae(regs); + role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs); return role; } -- cgit v1.2.3 From 84c679f5f52c7a98c9f0986ff89d50dc073b97f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:12 -0700 Subject: KVM: x86/mmu: Set CR4.PKE/LA57 in MMU role iff long mode is active Don't set cr4_pke or cr4_la57 in the MMU role if long mode isn't active, which is required for protection keys and 5-level paging to be fully enabled. Ignoring the bit avoids unnecessary reconfiguration on reuse, and also means consumers of mmu_role don't need to manually check for long mode. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-28-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b109ea16d39e..eb80d8a4bead 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4574,8 +4574,10 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, ext.cr4_smep = ____is_cr4_smep(regs); ext.cr4_smap = ____is_cr4_smap(regs); ext.cr4_pse = ____is_cr4_pse(regs); - ext.cr4_pke = ____is_cr4_pke(regs); - ext.cr4_la57 = ____is_cr4_la57(regs); + + /* PKEY and LA57 are active iff long mode is active. */ + ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); + ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); } ext.valid = 1; -- cgit v1.2.3 From 18db1b1790a899880dc4afdb9ac6c82c91080d66 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:13 -0700 Subject: KVM: x86/mmu: Always set new mmu_role immediately after checking old role Refactor shadow MMU initialization to immediately set its new mmu_role after verifying it differs from the old role, and so that all flavors of MMU initialization share the same check-and-set pattern. Immediately setting the role will allow future commits to use mmu_role to configure the MMU without consuming stale state. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-29-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index eb80d8a4bead..f5a55c97284c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4714,6 +4714,11 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte struct kvm_mmu_role_regs *regs, union kvm_mmu_role new_role) { + if (new_role.as_u64 == context->mmu_role.as_u64) + return; + + context->mmu_role.as_u64 = new_role.as_u64; + if (!____is_cr0_pg(regs)) nonpaging_init_context(vcpu, context); else if (____is_efer_lma(regs)) @@ -4731,7 +4736,6 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte } context->shadow_root_level = new_role.base.level; - context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } @@ -4742,8 +4746,7 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, union kvm_mmu_role new_role = kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false); - if (new_role.as_u64 != context->mmu_role.as_u64) - shadow_mmu_init_context(vcpu, context, regs, new_role); + shadow_mmu_init_context(vcpu, context, regs, new_role); } static union kvm_mmu_role @@ -4774,8 +4777,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); - if (new_role.as_u64 != context->mmu_role.as_u64) - shadow_mmu_init_context(vcpu, context, ®s, new_role); + shadow_mmu_init_context(vcpu, context, ®s, new_role); /* * Redo the shadow bits, the reset done by shadow_mmu_init_context() @@ -4823,6 +4825,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, if (new_role.as_u64 == context->mmu_role.as_u64) return; + context->mmu_role.as_u64 = new_role.as_u64; + context->shadow_root_level = level; context->nx = true; @@ -4833,7 +4837,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->invlpg = ept_invlpg; context->root_level = level; context->direct_map = false; - context->mmu_role.as_u64 = new_role.as_u64; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); -- cgit v1.2.3 From 8c985b2d8e682edac84bde63cef660cc574f795e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:14 -0700 Subject: KVM: x86/mmu: Don't grab CR4.PSE for calculating shadow reserved bits Unconditionally pass pse=false when calculating reserved bits for shadow PTEs. CR4.PSE is only relevant for 32-bit non-PAE paging, which KVM does not use for shadow paging (including nested NPT). Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-30-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f5a55c97284c..d017352d76c8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4281,19 +4281,22 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. */ bool uses_nx = context->nx || !tdp_enabled; + + /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ + bool is_amd = true; + /* KVM doesn't use 2-level page tables for the shadow MMU. */ + bool is_pse = false; struct rsvd_bits_validate *shadow_zero_check; int i; - /* - * Passing "true" to the last argument is okay; it adds a check - * on bit 8 of the SPTEs which KVM doesn't use anyway. - */ + WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL); + shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(vcpu, shadow_zero_check, reserved_hpa_bits(), context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse(vcpu), true); + is_pse, is_amd); if (!shadow_me_mask) return; @@ -4329,7 +4332,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, reserved_hpa_bits(), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), - true, true); + false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, reserved_hpa_bits(), false); -- cgit v1.2.3 From 4e9c0d80dbbd2dd411d726ed10eccaaba6d63a08 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:15 -0700 Subject: KVM: x86/mmu: Use MMU's role to get CR4.PSE for computing rsvd bits Use the MMU's role to get CR4.PSE when calculating reserved bits for the guest's PTEs. Practically speaking, this is a glorified nop as the role always come from vCPU state for the relevant flows, but converting to the roles will provide consistency once everything else is converted, and will Just Work if the "always comes from vCPU" behavior were ever to change (unlikely). Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-31-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d017352d76c8..3a8af50e1510 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4216,7 +4216,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, vcpu->arch.reserved_gpa_bits, context->root_level, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse(vcpu), + is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); } -- cgit v1.2.3 From b705a277b7059673c93e7ada01cc446dfae3e85a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:16 -0700 Subject: KVM: x86/mmu: Drop vCPU param from reserved bits calculator Drop the vCPU param from __reset_rsvds_bits_mask() as it's now unused, and ideally will remain unused in the future. Any information that's needed by the low level helper should be explicitly provided as it's used for both shadow/host MMUs and guest MMUs, i.e. vCPU state may be meaningless or simply wrong. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-32-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3a8af50e1510..7651f9cbd12e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4119,8 +4119,7 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, #undef PTTYPE static void -__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct rsvd_bits_validate *rsvd_check, +__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { @@ -4212,7 +4211,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, + __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->root_level, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), @@ -4292,8 +4291,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL); shadow_zero_check = &context->shadow_zero_check; - __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - reserved_hpa_bits(), + __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->shadow_root_level, uses_nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_pse, is_amd); @@ -4328,8 +4326,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, shadow_zero_check = &context->shadow_zero_check; if (boot_cpu_is_amd()) - __reset_rsvds_bits_mask(vcpu, shadow_zero_check, - reserved_hpa_bits(), + __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->shadow_root_level, false, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); -- cgit v1.2.3 From c596f1470ab7adb9ba6edf301b1f8f29dcefb55f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:17 -0700 Subject: KVM: x86/mmu: Use MMU's role to compute permission bitmask Use the MMU's role to generate the permission bitmasks for the MMU. For some flows, the vCPU state may not be correct (or relevant), e.g. the nested NPT MMU can be initialized with incoherent vCPU state. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-33-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7651f9cbd12e..d1d25dd9ca91 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4365,8 +4365,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, (7 & (access) ? 128 : 0)) -static void update_permission_bitmask(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, bool ept) +static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) { unsigned byte; @@ -4374,9 +4373,9 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, const u8 w = BYTE_MASK(ACC_WRITE_MASK); const u8 u = BYTE_MASK(ACC_USER_MASK); - bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0; - bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0; - bool cr0_wp = is_write_protection(vcpu); + bool cr4_smep = is_cr4_smep(mmu); + bool cr4_smap = is_cr4_smap(mmu); + bool cr0_wp = is_cr0_wp(mmu); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { unsigned pfec = byte << 1; @@ -4672,7 +4671,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } - update_permission_bitmask(vcpu, context, false); + update_permission_bitmask(context, false); update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); @@ -4730,7 +4729,7 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte if (____is_cr0_pg(regs)) { reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(vcpu, context, false); + update_permission_bitmask(context, false); update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); } @@ -4838,7 +4837,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->root_level = level; context->direct_map = false; - update_permission_bitmask(vcpu, context, true); + update_permission_bitmask(context, true); update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); @@ -4935,7 +4934,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } - update_permission_bitmask(vcpu, g_context, false); + update_permission_bitmask(g_context, false); update_pkru_bitmask(vcpu, g_context, false); update_last_nonleaf_level(vcpu, g_context); } -- cgit v1.2.3 From 2e4c06618d4024f760ba6dfab0978533bd00d03e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:18 -0700 Subject: KVM: x86/mmu: Use MMU's role to compute PKRU bitmask Use the MMU's role to calculate the Protection Keys (Restrict Userspace) bitmask instead of pulling bits from current vCPU state. For some flows, the vCPU state may not be correct (or relevant), e.g. EPT doesn't interact with PKRU. Case in point, the "ept" param simply disappears. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-34-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d1d25dd9ca91..4a6c1848d39f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4460,24 +4460,17 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) * away both AD and WD. For all reads or if the last condition holds, WD * only will be masked away. */ -static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept) +static void update_pkru_bitmask(struct kvm_mmu *mmu) { unsigned bit; bool wp; - if (ept) { + if (!is_cr4_pke(mmu)) { mmu->pkru_mask = 0; return; } - /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { - mmu->pkru_mask = 0; - return; - } - - wp = is_write_protection(vcpu); + wp = is_cr0_wp(mmu); for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { unsigned pfec, pkey_bits; @@ -4672,7 +4665,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(context, false); - update_pkru_bitmask(vcpu, context, false); + update_pkru_bitmask(context); update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4730,7 +4723,7 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte if (____is_cr0_pg(regs)) { reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(context, false); - update_pkru_bitmask(vcpu, context, false); + update_pkru_bitmask(context); update_last_nonleaf_level(vcpu, context); } context->shadow_root_level = new_role.base.level; @@ -4838,8 +4831,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->direct_map = false; update_permission_bitmask(context, true); - update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); + update_pkru_bitmask(context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4935,7 +4928,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(g_context, false); - update_pkru_bitmask(vcpu, g_context, false); + update_pkru_bitmask(g_context); update_last_nonleaf_level(vcpu, g_context); } -- cgit v1.2.3 From b67a93a87e1f9281a1d9f4a28052fed49b4591f1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:19 -0700 Subject: KVM: x86/mmu: Use MMU's roles to compute last non-leaf level Use the MMU's role to get CR4.PSE when determining the last level at which the guest _cannot_ create a non-leaf PTE, i.e. cannot create a huge page. Note, the existing logic is arguably wrong when considering 5-level paging and the case where 1gb pages aren't supported. In practice, the logic is confusing but not broken, because except for 32-bit non-PAE paging, bit 7 (_PAGE_PSE) bit is reserved when a huge page isn't supported at that level. I.e. setting bit 7 will terminate the guest walk one way or another. Furthermore, last_nonleaf_level is only consulted after KVM has verified there are no reserved bits set. All that confusion will be addressed in a future patch by dropping last_nonleaf_level entirely. For now, massage the code to continue the march toward using mmu_role for (almost) all MMU computations. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-35-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4a6c1848d39f..0ca2f9bd8284 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4504,12 +4504,12 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu) } } -static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +static void update_last_nonleaf_level(struct kvm_mmu *mmu) { unsigned root_level = mmu->root_level; mmu->last_nonleaf_level = root_level; - if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu)) + if (root_level == PT32_ROOT_LEVEL && is_cr4_pse(mmu)) mmu->last_nonleaf_level++; } @@ -4666,7 +4666,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(context, false); update_pkru_bitmask(context); - update_last_nonleaf_level(vcpu, context); + update_last_nonleaf_level(context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4724,7 +4724,7 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(context, false); update_pkru_bitmask(context); - update_last_nonleaf_level(vcpu, context); + update_last_nonleaf_level(context); } context->shadow_root_level = new_role.base.level; @@ -4831,7 +4831,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->direct_map = false; update_permission_bitmask(context, true); - update_last_nonleaf_level(vcpu, context); + update_last_nonleaf_level(context); update_pkru_bitmask(context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); @@ -4929,7 +4929,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(g_context, false); update_pkru_bitmask(g_context); - update_last_nonleaf_level(vcpu, g_context); + update_last_nonleaf_level(g_context); } void kvm_init_mmu(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From cd628f0f1e1ce0709c2c6bc852b1a3abf9638b26 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:20 -0700 Subject: KVM: x86/mmu: Use MMU's role to detect EFER.NX in guest page walk Use the NX bit from the MMU's role instead of the MMU itself so that the redundant, dedicated "nx" flag can be dropped. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-36-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 5cf36eb96ee2..c92e712607b6 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -471,7 +471,7 @@ retry_walk: error: errcode |= write_fault | user_fault; - if (fetch_fault && (mmu->nx || is_cr4_smep(mmu))) + if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu))) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; -- cgit v1.2.3 From 84a16226046d1c9339a9be3f2b76ea2dc5677f02 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:21 -0700 Subject: KVM: x86/mmu: Use MMU's role/role_regs to compute context's metadata Use the MMU's role and role_regs to calculate the MMU's guest root level and NX bit. For some flows, the vCPU state may not be correct (or relevant), e.g. EPT doesn't interact with EFER.NX and nested NPT will configure the guest_mmu with possibly-stale vCPU state. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-37-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0ca2f9bd8284..9c3bfc5cb527 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3948,8 +3948,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, max_level, true); } -static void nonpaging_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -4513,14 +4512,13 @@ static void update_last_nonleaf_level(struct kvm_mmu *mmu) mmu->last_nonleaf_level++; } -static void paging64_init_context_common(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, +static void paging64_init_context_common(struct kvm_mmu *context, int root_level) { - context->nx = is_nx(vcpu); + context->nx = is_efer_nx(context); context->root_level = root_level; - MMU_WARN_ON(!is_pae(vcpu)); + WARN_ON_ONCE(!is_cr4_pae(context)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; @@ -4528,17 +4526,16 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->direct_map = false; } -static void paging64_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging64_init_context(struct kvm_mmu *context, + struct kvm_mmu_role_regs *regs) { - int root_level = is_la57_mode(vcpu) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; + int root_level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : + PT64_ROOT_4LEVEL; - paging64_init_context_common(vcpu, context, root_level); + paging64_init_context_common(context, root_level); } -static void paging32_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32_init_context(struct kvm_mmu *context) { context->nx = false; context->root_level = PT32_ROOT_LEVEL; @@ -4549,10 +4546,9 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->direct_map = false; } -static void paging32E_init_context(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void paging32E_init_context(struct kvm_mmu *context) { - paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); + paging64_init_context_common(context, PT32E_ROOT_LEVEL); } static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, @@ -4712,13 +4708,13 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte context->mmu_role.as_u64 = new_role.as_u64; if (!____is_cr0_pg(regs)) - nonpaging_init_context(vcpu, context); + nonpaging_init_context(context); else if (____is_efer_lma(regs)) - paging64_init_context(vcpu, context); + paging64_init_context(context, regs); else if (____is_cr4_pae(regs)) - paging32E_init_context(vcpu, context); + paging32E_init_context(context); else - paging32_init_context(vcpu, context); + paging32_init_context(context); if (____is_cr0_pg(regs)) { reset_rsvds_bits_mask(vcpu, context); -- cgit v1.2.3 From 90599c280123618049af5cf375aae5b4e73bec03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:22 -0700 Subject: KVM: x86/mmu: Use MMU's role to get EFER.NX during MMU configuration Get the MMU's effective EFER.NX from its role instead of using the one-off, dedicated flag. This will allow dropping said flag in a future commit. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-38-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9c3bfc5cb527..5eaab1b73204 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4212,7 +4212,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, - context->root_level, context->nx, + context->root_level, is_efer_nx(context), guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); @@ -4278,7 +4278,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, * NX can be used by any non-nested shadow MMU to avoid having to reset * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. */ - bool uses_nx = context->nx || !tdp_enabled; + bool uses_nx = is_efer_nx(context) || !tdp_enabled; /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ bool is_amd = true; @@ -4375,6 +4375,7 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) bool cr4_smep = is_cr4_smep(mmu); bool cr4_smap = is_cr4_smap(mmu); bool cr0_wp = is_cr0_wp(mmu); + bool efer_nx = is_efer_nx(mmu); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { unsigned pfec = byte << 1; @@ -4400,7 +4401,7 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; /* Not really needed: !nx will cause pte.nx to fault */ - if (!mmu->nx) + if (!efer_nx) ff = 0; /* Allow supervisor writes if !cr0.wp */ -- cgit v1.2.3 From a4c93252fed1517362d2ce43c6a5fd50a1152ed6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:23 -0700 Subject: KVM: x86/mmu: Drop "nx" from MMU context now that there are no readers Drop kvm_mmu.nx as there no consumers left. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-39-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/mmu/mmu.c | 17 ----------------- 2 files changed, 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 520140eed423..3f4f6ad7405b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -423,8 +423,6 @@ struct kvm_mmu { /* Can have large pages at levels 2..last_nonleaf_level-1. */ u8 last_nonleaf_level; - bool nx; - u64 pdptrs[4]; /* pae */ }; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5eaab1b73204..91b27538328f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -322,11 +322,6 @@ static int is_cpuid_PSE36(void) return 1; } -static int is_nx(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.efer & EFER_NX; -} - static gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; @@ -3956,7 +3951,6 @@ static void nonpaging_init_context(struct kvm_mmu *context) context->invlpg = NULL; context->root_level = 0; context->direct_map = true; - context->nx = false; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, @@ -4516,7 +4510,6 @@ static void update_last_nonleaf_level(struct kvm_mmu *mmu) static void paging64_init_context_common(struct kvm_mmu *context, int root_level) { - context->nx = is_efer_nx(context); context->root_level = root_level; WARN_ON_ONCE(!is_cr4_pae(context)); @@ -4538,7 +4531,6 @@ static void paging64_init_context(struct kvm_mmu *context, static void paging32_init_context(struct kvm_mmu *context) { - context->nx = false; context->root_level = PT32_ROOT_LEVEL; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -4640,22 +4632,18 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->inject_page_fault = kvm_inject_page_fault; if (!is_paging(vcpu)) { - context->nx = false; context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { - context->nx = is_nx(vcpu); context->root_level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging64_gva_to_gpa; } else if (is_pae(vcpu)) { - context->nx = is_nx(vcpu); context->root_level = PT32E_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging64_gva_to_gpa; } else { - context->nx = false; context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging32_gva_to_gpa; @@ -4818,7 +4806,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->shadow_root_level = level; - context->nx = true; context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; @@ -4903,22 +4890,18 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) { - g_context->nx = false; g_context->root_level = 0; g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { - g_context->nx = is_nx(vcpu); g_context->root_level = is_la57_mode(vcpu) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { - g_context->nx = is_nx(vcpu); g_context->root_level = PT32E_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else { - g_context->nx = false; g_context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging32_gva_to_gpa_nested; -- cgit v1.2.3 From 5472fcd4c6c8026565644f31490cfddfdafb9519 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:24 -0700 Subject: KVM: x86/mmu: Get nested MMU's root level from the MMU's role Initialize the MMU's (guest) root_level using its mmu_role instead of redoing the calculations. The role_regs used to calculate the mmu_role are initialized from the vCPU, i.e. this should be a complete nop. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-40-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 91b27538328f..3d87b7fcf6b3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4874,6 +4874,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->get_guest_pgd = get_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; + g_context->root_level = new_role.base.level; /* * L2 page tables are never shadowed, so there is no need to sync @@ -4890,19 +4891,14 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) { - g_context->root_level = 0; g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { - g_context->root_level = is_la57_mode(vcpu) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { - g_context->root_level = PT32E_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else { - g_context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } -- cgit v1.2.3 From 87e99d7d7054f6a861f18b0e2f30280d2f526f23 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:25 -0700 Subject: KVM: x86/mmu: Use MMU role_regs to get LA57, and drop vCPU LA57 helper Get LA57 from the role_regs, which are initialized from the vCPU even though TDP is enabled, instead of pulling the value directly from the vCPU when computing the guest's root_level for TDP MMUs. Note, the check is inside an is_long_mode() statement, so that requirement is not lost. Use role_regs even though the MMU's role is available and arguably "better". A future commit will consolidate the guest root level logic, and it needs access to EFER.LMA, which is not tracked in the role (it can't be toggled on VM-Exit, unlike LA57). Drop is_la57_mode() as there are no remaining users, and to discourage pulling MMU state from the vCPU (in the future). No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-41-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/x86.h | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3d87b7fcf6b3..b5e63c4ed7d1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4635,7 +4635,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { - context->root_level = is_la57_mode(vcpu) ? + context->root_level = ____is_cr4_la57(®s) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging64_gva_to_gpa; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 521f74e5bbf2..44ae10312740 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -157,16 +157,6 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) return cs_l; } -static inline bool is_la57_mode(struct kvm_vcpu *vcpu) -{ -#ifdef CONFIG_X86_64 - return (vcpu->arch.efer & EFER_LMA) && - kvm_read_cr4_bits(vcpu, X86_CR4_LA57); -#else - return 0; -#endif -} - static inline bool x86_exception_has_error_code(unsigned int vector) { static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) | -- cgit v1.2.3 From fa4b558802c0ed4ef8132c1b2d1e993c519eb0ae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:26 -0700 Subject: KVM: x86/mmu: Consolidate reset_rsvds_bits_mask() calls Move calls to reset_rsvds_bits_mask() out of the various mode statements and under a more generic CR0.PG=1 check. This will allow for additional code consolidation in the future. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-42-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index b5e63c4ed7d1..9a06003811fd 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4637,18 +4637,18 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } else if (is_long_mode(vcpu)) { context->root_level = ____is_cr4_la57(®s) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; - reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging64_gva_to_gpa; } else if (is_pae(vcpu)) { context->root_level = PT32E_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging64_gva_to_gpa; } else { context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context); context->gva_to_gpa = paging32_gva_to_gpa; } + if (is_cr0_pg(context)) + reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(context, false); update_pkru_bitmask(context); update_last_nonleaf_level(context); @@ -4890,18 +4890,17 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * nested page tables as the second level of translation. Basically * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ - if (!is_paging(vcpu)) { + if (!is_paging(vcpu)) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; - } else if (is_long_mode(vcpu)) { - reset_rsvds_bits_mask(vcpu, g_context); + else if (is_long_mode(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else if (is_pae(vcpu)) { - reset_rsvds_bits_mask(vcpu, g_context); + else if (is_pae(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa_nested; - } else { - reset_rsvds_bits_mask(vcpu, g_context); + else g_context->gva_to_gpa = paging32_gva_to_gpa_nested; - } + + if (is_cr0_pg(g_context)) + reset_rsvds_bits_mask(vcpu, g_context); update_permission_bitmask(g_context, false); update_pkru_bitmask(g_context); -- cgit v1.2.3 From af0eb17e99e5df76380404881e3e5042d582a6b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:27 -0700 Subject: KVM: x86/mmu: Don't update nested guest's paging bitmasks if CR0.PG=0 Don't bother updating the bitmasks and last-leaf information if paging is disabled as the metadata will never be used. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-43-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 9a06003811fd..6447d9fe1672 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4646,12 +4646,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } - if (is_cr0_pg(context)) + if (is_cr0_pg(context)) { reset_rsvds_bits_mask(vcpu, context); - - update_permission_bitmask(context, false); - update_pkru_bitmask(context); - update_last_nonleaf_level(context); + update_permission_bitmask(context, false); + update_pkru_bitmask(context); + update_last_nonleaf_level(context); + } reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4899,12 +4899,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) else g_context->gva_to_gpa = paging32_gva_to_gpa_nested; - if (is_cr0_pg(g_context)) + if (is_cr0_pg(g_context)) { reset_rsvds_bits_mask(vcpu, g_context); - - update_permission_bitmask(g_context, false); - update_pkru_bitmask(g_context); - update_last_nonleaf_level(g_context); + update_permission_bitmask(g_context, false); + update_pkru_bitmask(g_context); + update_last_nonleaf_level(g_context); + } } void kvm_init_mmu(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 533f9a4b387bf79c722faf0a760a09129d9627f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:28 -0700 Subject: KVM: x86/mmu: Add helper to update paging metadata Consolidate MMU guest metadata updates into a common helper for TDP, shadow, and nested MMUs. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-44-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6447d9fe1672..01ab309f8f31 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4507,6 +4507,18 @@ static void update_last_nonleaf_level(struct kvm_mmu *mmu) mmu->last_nonleaf_level++; } +static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu) +{ + if (!is_cr0_pg(mmu)) + return; + + reset_rsvds_bits_mask(vcpu, mmu); + update_permission_bitmask(mmu, false); + update_pkru_bitmask(mmu); + update_last_nonleaf_level(mmu); +} + static void paging64_init_context_common(struct kvm_mmu *context, int root_level) { @@ -4646,12 +4658,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } - if (is_cr0_pg(context)) { - reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(context, false); - update_pkru_bitmask(context); - update_last_nonleaf_level(context); - } + reset_guest_paging_metadata(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4705,12 +4712,7 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte else paging32_init_context(context); - if (____is_cr0_pg(regs)) { - reset_rsvds_bits_mask(vcpu, context); - update_permission_bitmask(context, false); - update_pkru_bitmask(context); - update_last_nonleaf_level(context); - } + reset_guest_paging_metadata(vcpu, context); context->shadow_root_level = new_role.base.level; reset_shadow_zero_bits_mask(vcpu, context); @@ -4899,12 +4901,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) else g_context->gva_to_gpa = paging32_gva_to_gpa_nested; - if (is_cr0_pg(g_context)) { - reset_rsvds_bits_mask(vcpu, g_context); - update_permission_bitmask(g_context, false); - update_pkru_bitmask(g_context); - update_last_nonleaf_level(g_context); - } + reset_guest_paging_metadata(vcpu, g_context); } void kvm_init_mmu(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From f4bd6f73763a91a0c6fc39974d57034e19f25494 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:29 -0700 Subject: KVM: x86/mmu: Add a helper to calculate root from role_regs Add a helper to calculate the level for non-EPT page tables from the MMU's role_regs. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-45-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 60 +++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 01ab309f8f31..8cf0c1a83716 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -238,6 +238,19 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) return regs; } +static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs) +{ + if (!____is_cr0_pg(regs)) + return 0; + else if (____is_efer_lma(regs)) + return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : + PT64_ROOT_4LEVEL; + else if (____is_cr4_pae(regs)) + return PT32E_ROOT_LEVEL; + else + return PT32_ROOT_LEVEL; +} + static inline bool kvm_available_flush_tlb_with_range(void) { return kvm_x86_ops.tlb_remote_flush_with_range; @@ -3949,7 +3962,6 @@ static void nonpaging_init_context(struct kvm_mmu *context) context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; context->invlpg = NULL; - context->root_level = 0; context->direct_map = true; } @@ -4519,11 +4531,8 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, update_last_nonleaf_level(mmu); } -static void paging64_init_context_common(struct kvm_mmu *context, - int root_level) +static void paging64_init_context_common(struct kvm_mmu *context) { - context->root_level = root_level; - WARN_ON_ONCE(!is_cr4_pae(context)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; @@ -4532,18 +4541,13 @@ static void paging64_init_context_common(struct kvm_mmu *context, context->direct_map = false; } -static void paging64_init_context(struct kvm_mmu *context, - struct kvm_mmu_role_regs *regs) +static void paging64_init_context(struct kvm_mmu *context) { - int root_level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : - PT64_ROOT_4LEVEL; - - paging64_init_context_common(context, root_level); + paging64_init_context_common(context); } static void paging32_init_context(struct kvm_mmu *context) { - context->root_level = PT32_ROOT_LEVEL; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->sync_page = paging32_sync_page; @@ -4553,7 +4557,7 @@ static void paging32_init_context(struct kvm_mmu *context) static void paging32E_init_context(struct kvm_mmu *context) { - paging64_init_context_common(context, PT32E_ROOT_LEVEL); + paging64_init_context_common(context); } static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, @@ -4642,21 +4646,16 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; + context->root_level = role_regs_to_root_level(®s); - if (!is_paging(vcpu)) { + if (!is_paging(vcpu)) context->gva_to_gpa = nonpaging_gva_to_gpa; - context->root_level = 0; - } else if (is_long_mode(vcpu)) { - context->root_level = ____is_cr4_la57(®s) ? - PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; + else if (is_long_mode(vcpu)) context->gva_to_gpa = paging64_gva_to_gpa; - } else if (is_pae(vcpu)) { - context->root_level = PT32E_ROOT_LEVEL; + else if (is_pae(vcpu)) context->gva_to_gpa = paging64_gva_to_gpa; - } else { - context->root_level = PT32_ROOT_LEVEL; + else context->gva_to_gpa = paging32_gva_to_gpa; - } reset_guest_paging_metadata(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); @@ -4706,11 +4705,12 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte if (!____is_cr0_pg(regs)) nonpaging_init_context(context); else if (____is_efer_lma(regs)) - paging64_init_context(context, regs); + paging64_init_context(context); else if (____is_cr4_pae(regs)) paging32E_init_context(context); else paging32_init_context(context); + context->root_level = role_regs_to_root_level(regs); reset_guest_paging_metadata(vcpu, context); context->shadow_root_level = new_role.base.level; @@ -4849,17 +4849,7 @@ kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) * to "true" to try to detect bogus usage of the nested MMU. */ role.base.direct = true; - - if (!____is_cr0_pg(regs)) - role.base.level = 0; - else if (____is_efer_lma(regs)) - role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : - PT64_ROOT_4LEVEL; - else if (____is_cr4_pae(regs)) - role.base.level = PT32E_ROOT_LEVEL; - else - role.base.level = PT32_ROOT_LEVEL; - + role.base.level = role_regs_to_root_level(regs); return role; } -- cgit v1.2.3 From fe660f7244d7e237ab7726813dc9aec8e94900d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:30 -0700 Subject: KVM: x86/mmu: Collapse 32-bit PAE and 64-bit statements for helpers Skip paging32E_init_context() and paging64_init_context_common() and go directly to paging64_init_context() (was the common version) now that the relevant flows don't need to distinguish between 64-bit PAE and 32-bit PAE for other reasons. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-46-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8cf0c1a83716..08ac4e451b95 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4531,9 +4531,8 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, update_last_nonleaf_level(mmu); } -static void paging64_init_context_common(struct kvm_mmu *context) +static void paging64_init_context(struct kvm_mmu *context) { - WARN_ON_ONCE(!is_cr4_pae(context)); context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_page = paging64_sync_page; @@ -4541,11 +4540,6 @@ static void paging64_init_context_common(struct kvm_mmu *context) context->direct_map = false; } -static void paging64_init_context(struct kvm_mmu *context) -{ - paging64_init_context_common(context); -} - static void paging32_init_context(struct kvm_mmu *context) { context->page_fault = paging32_page_fault; @@ -4555,11 +4549,6 @@ static void paging32_init_context(struct kvm_mmu *context) context->direct_map = false; } -static void paging32E_init_context(struct kvm_mmu *context) -{ - paging64_init_context_common(context); -} - static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) { @@ -4650,8 +4639,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) if (!is_paging(vcpu)) context->gva_to_gpa = nonpaging_gva_to_gpa; - else if (is_long_mode(vcpu)) - context->gva_to_gpa = paging64_gva_to_gpa; else if (is_pae(vcpu)) context->gva_to_gpa = paging64_gva_to_gpa; else @@ -4704,10 +4691,8 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte if (!____is_cr0_pg(regs)) nonpaging_init_context(context); - else if (____is_efer_lma(regs)) - paging64_init_context(context); else if (____is_cr4_pae(regs)) - paging32E_init_context(context); + paging64_init_context(context); else paging32_init_context(context); context->root_level = role_regs_to_root_level(regs); -- cgit v1.2.3 From 36f267871edceafbfbbc5d570c34c089a2afa1c1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:31 -0700 Subject: KVM: x86/mmu: Use MMU's role to determine PTTYPE Use the MMU's role instead of vCPU state or role_regs to determine the PTTYPE, i.e. which helpers to wire up. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-47-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 08ac4e451b95..4676d696b909 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4637,9 +4637,9 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->inject_page_fault = kvm_inject_page_fault; context->root_level = role_regs_to_root_level(®s); - if (!is_paging(vcpu)) + if (!is_cr0_pg(context)) context->gva_to_gpa = nonpaging_gva_to_gpa; - else if (is_pae(vcpu)) + else if (is_cr4_pae(context)) context->gva_to_gpa = paging64_gva_to_gpa; else context->gva_to_gpa = paging32_gva_to_gpa; @@ -4689,9 +4689,9 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte context->mmu_role.as_u64 = new_role.as_u64; - if (!____is_cr0_pg(regs)) + if (!is_cr0_pg(context)) nonpaging_init_context(context); - else if (____is_cr4_pae(regs)) + else if (is_cr4_pae(context)) paging64_init_context(context); else paging32_init_context(context); -- cgit v1.2.3 From 961f84457cd4e2fc479e59d015f1d292ec30373b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:32 -0700 Subject: KVM: x86/mmu: Add helpers to do full reserved SPTE checks w/ generic MMU Extract the reserved SPTE check and print helpers in get_mmio_spte() to new helpers so that KVM can also WARN on reserved badness when making a SPTE. Tag the checking helper with __always_inline to improve the probability of the compiler generating optimal code for the checking loop, e.g. gcc appears to avoid using %rbp when the helper is tagged with a vanilla "inline". No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-48-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 23 ++--------------------- arch/x86/kvm/mmu/spte.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4676d696b909..ad025059a041 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3594,19 +3594,6 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } -static bool -__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) -{ - int bit7 = (pte >> 7) & 1; - - return pte & rsvd_check->rsvd_bits_mask[bit7][level-1]; -} - -static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte) -{ - return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); -} - static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { /* @@ -3684,13 +3671,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) - /* - * Use a bitwise-OR instead of a logical-OR to aggregate the - * reserved bit and EPT's invalid memtype/XWR checks to avoid - * adding a Jcc in the loop. - */ - reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) | - __is_rsvd_bits_set(rsvd_check, sptes[level], level); + reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); if (reserved) { pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", @@ -3698,7 +3679,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) for (level = root; level >= leaf; level--) pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", sptes[level], level, - rsvd_check->rsvd_bits_mask[(sptes[level] >> 7) & 1][level-1]); + get_rsvd_bits(rsvd_check, sptes[level], level)); } return reserved; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index bca0ba11cccf..7a5ce9314107 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -293,6 +293,38 @@ static inline bool is_dirty_spte(u64 spte) return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } +static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, + int level) +{ + int bit7 = (pte >> 7) & 1; + + return rsvd_check->rsvd_bits_mask[bit7][level-1]; +} + +static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, + u64 pte, int level) +{ + return pte & get_rsvd_bits(rsvd_check, pte, level); +} + +static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, + u64 pte) +{ + return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f); +} + +static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, + u64 spte, int level) +{ + /* + * Use a bitwise-OR instead of a logical-OR to aggregate the reserved + * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc + * (this is extremely unlikely to be short-circuited as true). + */ + return __is_bad_mt_xwr(rsvd_check, spte) | + __is_rsvd_bits_set(rsvd_check, spte, level); +} + static inline bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & shadow_host_writable_mask) && -- cgit v1.2.3 From 3b77daa5efe1cb343ee498ade6ee58c8ada58074 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:33 -0700 Subject: KVM: x86/mmu: WARN on any reserved SPTE value when making a valid SPTE Replace make_spte()'s WARN on a collision with the magic MMIO value with a generic WARN on reserved bits being set (including EPT's reserved WX combination). Warning on any reserved bits covers MMIO, A/D tracking bits with PAE paging, and in theory any future goofs that are introduced. Opportunistically convert to ONCE behavior to avoid spamming the kernel log, odds are very good that if KVM screws up one SPTE, it will botch all SPTEs for the same MMU. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-49-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 246e61e0771e..3e97cdb13eb7 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -175,7 +175,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, spte = mark_spte_for_access_track(spte); out: - WARN_ON(is_mmio_spte(spte)); + WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), + "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, + get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + *new_spte = spte; return ret; } -- cgit v1.2.3 From 616007c866a250143e95ea7a696bd924df251f8a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:34 -0700 Subject: KVM: x86: Enhance comments for MMU roles and nested transition trickiness Expand the comments for the MMU roles. The interactions with gfn_track PGD reuse in particular are hairy. Regarding PGD reuse, add comments in the nested virtualization flows to call out why kvm_init_mmu() is unconditionally called even when nested TDP is used. Cc: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-50-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 57 +++++++++++++++++++++++++++++++++-------- arch/x86/kvm/svm/nested.c | 1 + arch/x86/kvm/vmx/nested.c | 1 + 3 files changed, 49 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3f4f6ad7405b..f033ecf43d4c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -269,12 +269,36 @@ enum x86_intercept_stage; struct kvm_kernel_irq_routing_entry; /* - * the pages used as guest page table on soft mmu are tracked by - * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used - * by indirect shadow page can not be more than 15 bits. + * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page + * also includes TDP pages) to determine whether or not a page can be used in + * the given MMU context. This is a subset of the overall kvm_mmu_role to + * minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating + * 2 bytes per gfn instead of 4 bytes per gfn. * - * Currently, we used 14 bits that are @level, @gpte_is_8_bytes, @quadrant, @access, - * @efer_nx, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. + * Indirect upper-level shadow pages are tracked for write-protection via + * gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create + * more than 2^16-1 upper-level shadow pages at a single gfn, otherwise + * gfn_track will overflow and explosions will ensure. + * + * A unique shadow page (SP) for a gfn is created if and only if an existing SP + * cannot be reused. The ability to reuse a SP is tracked by its role, which + * incorporates various mode bits and properties of the SP. Roughly speaking, + * the number of unique SPs that can theoretically be created is 2^n, where n + * is the number of bits that are used to compute the role. + * + * But, even though there are 18 bits in the mask below, not all combinations + * of modes and flags are possible. The maximum number of possible upper-level + * shadow pages for a single gfn is in the neighborhood of 2^13. + * + * - invalid shadow pages are not accounted. + * - level is effectively limited to four combinations, not 16 as the number + * bits would imply, as 4k SPs are not tracked (allowed to go unsync). + * - level is effectively unused for non-PAE paging because there is exactly + * one upper level (see 4k SP exception above). + * - quadrant is used only for non-PAE paging and is exclusive with + * gpte_is_8_bytes. + * - execonly and ad_disabled are used only for nested EPT, which makes it + * exclusive with quadrant. */ union kvm_mmu_page_role { u32 word; @@ -303,13 +327,26 @@ union kvm_mmu_page_role { }; }; -union kvm_mmu_extended_role { /* - * This structure complements kvm_mmu_page_role caching everything needed for - * MMU configuration. If nothing in both these structures changed, MMU - * re-configuration can be skipped. @valid bit is set on first usage so we don't - * treat all-zero structure as valid data. + * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties + * relevant to the current MMU configuration. When loading CR0, CR4, or EFER, + * including on nested transitions, if nothing in the full role changes then + * MMU re-configuration can be skipped. @valid bit is set on first usage so we + * don't treat all-zero structure as valid data. + * + * The properties that are tracked in the extended role but not the page role + * are for things that either (a) do not affect the validity of the shadow page + * or (b) are indirectly reflected in the shadow page's role. For example, + * CR4.PKE only affects permission checks for software walks of the guest page + * tables (because KVM doesn't support Protection Keys with shadow paging), and + * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level. + * + * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role. + * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and + * SMAP, but the MMU's permission checks for software walks need to be SMEP and + * SMAP aware regardless of CR0.WP. */ +union kvm_mmu_extended_role { u32 word; struct { unsigned int valid:1; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a9e3b0736c20..21d03e3a5dfd 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -424,6 +424,7 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); return 0; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index fa3f50f0a3fa..1a52134b0c42 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1098,6 +1098,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ kvm_init_mmu(vcpu); return 0; -- cgit v1.2.3 From 7cd138db5cae0dac295714b4412a9b44fb4f4e65 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:35 -0700 Subject: KVM: x86/mmu: Optimize and clean up so called "last nonleaf level" logic Drop the pre-computed last_nonleaf_level, which is arguably wrong and at best confusing. Per the comment: Can have large pages at levels 2..last_nonleaf_level-1. the intent of the variable would appear to be to track what levels can _legally_ have large pages, but that intent doesn't align with reality. The computed value will be wrong for 5-level paging, or if 1gb pages are not supported. The flawed code is not a problem in practice, because except for 32-bit PSE paging, bit 7 is reserved if large pages aren't supported at the level. Take advantage of this invariant and simply omit the level magic math for 64-bit page tables (including PAE). For 32-bit paging (non-PAE), the adjustments are needed purely because bit 7 is ignored if PSE=0. Retain that logic as is, but make is_last_gpte() unique per PTTYPE so that the PSE check is avoided for PAE and EPT paging. In the spirit of avoiding branches, bump the "last nonleaf level" for 32-bit PSE paging by adding the PSE bit itself. Note, bit 7 is ignored or has other meaning in CR3/EPTP, but despite FNAME(walk_addr_generic) briefly grabbing CR3/EPTP in "pte", they are not PTEs and will blow up all the other gpte helpers. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-51-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 --- arch/x86/kvm/mmu/mmu.c | 31 ------------------------------- arch/x86/kvm/mmu/paging_tmpl.h | 31 ++++++++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f033ecf43d4c..3cd496c8acb8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -457,9 +457,6 @@ struct kvm_mmu { struct rsvd_bits_validate guest_rsvd_check; - /* Can have large pages at levels 2..last_nonleaf_level-1. */ - u8 last_nonleaf_level; - u64 pdptrs[4]; /* pae */ }; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index ad025059a041..417f81c004da 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4071,26 +4071,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return false; } -static inline bool is_last_gpte(struct kvm_mmu *mmu, - unsigned level, unsigned gpte) -{ - /* - * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. - * If it is clear, there are no large pages at this level, so clear - * PT_PAGE_SIZE_MASK in gpte if that is the case. - */ - gpte &= level - mmu->last_nonleaf_level; - - /* - * PG_LEVEL_4K always terminates. The RHS has bit 7 set - * iff level <= PG_LEVEL_4K, which for our purpose means - * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. - */ - gpte |= level - PG_LEVEL_4K - 1; - - return gpte & PT_PAGE_SIZE_MASK; -} - #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" @@ -4491,15 +4471,6 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu) } } -static void update_last_nonleaf_level(struct kvm_mmu *mmu) -{ - unsigned root_level = mmu->root_level; - - mmu->last_nonleaf_level = root_level; - if (root_level == PT32_ROOT_LEVEL && is_cr4_pse(mmu)) - mmu->last_nonleaf_level++; -} - static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { @@ -4509,7 +4480,6 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, mmu); update_permission_bitmask(mmu, false); update_pkru_bitmask(mmu); - update_last_nonleaf_level(mmu); } static void paging64_init_context(struct kvm_mmu *context) @@ -4783,7 +4753,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->direct_map = false; update_permission_bitmask(context, true); - update_last_nonleaf_level(context); update_pkru_bitmask(context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index c92e712607b6..75c3fe966e81 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -305,6 +305,35 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) return pkeys; } +static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, + unsigned int level, unsigned int gpte) +{ + /* + * For EPT and PAE paging (both variants), bit 7 is either reserved at + * all level or indicates a huge page (ignoring CR3/EPTP). In either + * case, bit 7 being set terminates the walk. + */ +#if PTTYPE == 32 + /* + * 32-bit paging requires special handling because bit 7 is ignored if + * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is + * greater than the last level for which bit 7 is the PAGE_SIZE bit. + * + * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7 + * is not reserved and does not indicate a large page at this level, + * so clear PT_PAGE_SIZE_MASK in gpte if that is the case. + */ + gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse); +#endif + /* + * PG_LEVEL_4K always terminates. The RHS has bit 7 set + * iff level <= PG_LEVEL_4K, which for our purpose means + * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PG_LEVEL_4K - 1; + + return gpte & PT_PAGE_SIZE_MASK; +} /* * Fetch a guest pte for a guest virtual address, or for an L2's GPA. */ @@ -421,7 +450,7 @@ retry_walk: /* Convert to ACC_*_MASK flags for struct guest_walker. */ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); - } while (!is_last_gpte(mmu, walker->level, pte)); + } while (!FNAME(is_last_gpte)(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; -- cgit v1.2.3 From f82fdaf536ee6de36e3a7b4764f17b81afb8ef93 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:36 -0700 Subject: KVM: x86/mmu: Drop redundant rsvd bits reset for nested NPT Drop the extra reset of shadow_zero_bits in the nested NPT flow now that shadow_mmu_init_context computes the correct level for nested NPT. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-52-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 417f81c004da..690f560341a2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4693,12 +4693,6 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); shadow_mmu_init_context(vcpu, context, ®s, new_role); - - /* - * Redo the shadow bits, the reset done by shadow_mmu_init_context() - * (above) may use the wrong shadow_root_level. - */ - reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); -- cgit v1.2.3 From fdaa293598f908adb945001dabb305225144e183 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:37 -0700 Subject: KVM: x86/mmu: Get CR0.WP from MMU, not vCPU, in shadow page fault Use the current MMU instead of vCPU state to query CR0.WP when handling a page fault. In the nested NPT case, the current CR0.WP reflects L2, whereas the page fault is shadowing L1's NPT. Practically speaking, this is a nop a NPT walks are always user faults, but fix it up for consistency. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-53-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 5 ----- arch/x86/kvm/mmu/paging_tmpl.h | 5 ++--- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 62844bacd13f..83e6c6965f1e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -165,11 +165,6 @@ static inline bool is_writable_pte(unsigned long pte) return pte & PT_WRITABLE_MASK; } -static inline bool is_write_protection(struct kvm_vcpu *vcpu) -{ - return kvm_read_cr0_bits(vcpu, X86_CR0_WP); -} - /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 75c3fe966e81..2f5a0e8d05ed 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -795,7 +795,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, bool self_changed = false; if (!(walker->pte_access & ACC_WRITE_MASK || - (!is_write_protection(vcpu) && !user_fault))) + (!is_cr0_wp(vcpu->arch.mmu) && !user_fault))) return false; for (level = walker->level; level <= walker->max_level; level++) { @@ -893,8 +893,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, * we will cache the incorrect access into mmio spte. */ if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && - !is_write_protection(vcpu) && !user_fault && - !is_noslot_pfn(pfn)) { + !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; -- cgit v1.2.3 From 9a65d0b70fa06ae46b9f8ab7dc8e6b3c6f4661ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:38 -0700 Subject: KVM: x86/mmu: Get CR4.SMEP from MMU, not vCPU, in shadow page fault Use the current MMU instead of vCPU state to query CR4.SMEP when handling a page fault. In the nested NPT case, the current CR4.SMEP reflects L2, whereas the page fault is shadowing L1's NPT, which uses L1's hCR4. Practically speaking, this is a nop a NPT walks are always user faults, i.e. this code will never be reached, but fix it up for consistency. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-54-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 2f5a0e8d05ed..490a028ddabe 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -903,7 +903,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, * then we should prevent the kernel from executing it * if SMEP is enabled. */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (is_cr4_smep(vcpu->arch.mmu)) walker.pte_access &= ~ACC_EXEC_MASK; } -- cgit v1.2.3 From 27de925044e18eb056d6157305c841b1408621b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 22 Jun 2021 10:57:39 -0700 Subject: KVM: x86/mmu: Let guest use GBPAGES if supported in hardware and TDP is on Let the guest use 1g hugepages if TDP is enabled and the host supports GBPAGES, KVM can't actively prevent the guest from using 1g pages in this case since they can't be disabled in the hardware page walker. While injecting a page fault if a bogus 1g page is encountered during a software page walk is perfectly reasonable since KVM is simply honoring userspace's vCPU model, doing so arguably doesn't provide any meaningful value, and at worst will be horribly confusing as the guest will see inconsistent behavior and seemingly spurious page faults. Signed-off-by: Sean Christopherson Message-Id: <20210622175739.3610207-55-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 690f560341a2..00732757cc60 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4174,13 +4174,28 @@ __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, } } +static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) +{ + /* + * If TDP is enabled, let the guest use GBPAGES if they're supported in + * hardware. The hardware page walker doesn't let KVM disable GBPAGES, + * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA + * walk for performance and complexity reasons. Not to mention KVM + * _can't_ solve the problem because GVA->GPA walks aren't visible to + * KVM once a TDP translation is installed. Mimic hardware behavior so + * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. + */ + return tdp_enabled ? boot_cpu_has(X86_FEATURE_GBPAGES) : + guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); +} + static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->root_level, is_efer_nx(context), - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), + guest_can_use_gbpages(vcpu), is_cr4_pse(context), guest_cpuid_is_amd_or_hygon(vcpu)); } @@ -4259,8 +4274,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->shadow_root_level, uses_nx, - guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse, is_amd); + guest_can_use_gbpages(vcpu), is_pse, is_amd); if (!shadow_me_mask) return; -- cgit v1.2.3 From 19238e75bd8ed8ffe784bf5b37586e77b2093742 Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Mon, 10 May 2021 07:48:33 -0700 Subject: kvm: x86: Allow userspace to handle emulation errors Add a fallback mechanism to the in-kernel instruction emulator that allows userspace the opportunity to process an instruction the emulator was unable to. When the in-kernel instruction emulator fails to process an instruction it will either inject a #UD into the guest or exit to userspace with exit reason KVM_INTERNAL_ERROR. This is because it does not know how to proceed in an appropriate manner. This feature lets userspace get involved to see if it can figure out a better path forward. Signed-off-by: Aaron Lewis Reviewed-by: David Edmondson Message-Id: <20210510144834.658457-2-aaronlewis@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 20 ++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/x86.c | 40 ++++++++++++++++++++++++++++++++++++---- include/uapi/linux/kvm.h | 23 +++++++++++++++++++++++ 4 files changed, 85 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 5d8db4922df6..3b6e3b1628b4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6546,6 +6546,7 @@ KVM_RUN_BUS_LOCK flag is used to distinguish between them. This capability can be used to check / enable 2nd DAWR feature provided by POWER10 processor. + 7.24 KVM_CAP_VM_COPY_ENC_CONTEXT_FROM ------------------------------------- @@ -6603,6 +6604,25 @@ present in the "ibm,hypertas-functions" device-tree property. This capability is enabled for hypervisors on platforms like POWER9 that support radix MMU. +7.27 KVM_CAP_EXIT_ON_EMULATION_FAILURE +-------------------------------------- + +:Architectures: x86 +:Parameters: args[0] whether the feature should be enabled or not + +When this capability is enabled, an emulation failure will result in an exit +to userspace with KVM_INTERNAL_ERROR (except when the emulator was invoked +to handle a VMware backdoor instruction). Furthermore, KVM will now provide up +to 15 instruction bytes for any exit to userspace resulting from an emulation +failure. When these exits to userspace occur use the emulation_failure struct +instead of the internal struct. They both have the same layout, but the +emulation_failure struct matches the content better. It also explicitly +defines the 'flags' field which is used to describe the fields in the struct +that are valid (ie: if KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES is +set in the 'flags' field then both 'insn_size' and 'insn_bytes' have valid data +in them.) + + 8. Other capabilities. ====================== diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3cd496c8acb8..c9ec5c76c438 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1114,6 +1114,12 @@ struct kvm_arch { bool exception_payload_enabled; bool bus_lock_detection_enabled; + /* + * If exit_on_emulation_error is set, and the in-kernel instruction + * emulator fails to emulate an instruction, allow userspace + * the opportunity to look at it. + */ + bool exit_on_emulation_error; /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7c7b2b28de7..17468d983fbd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4010,6 +4010,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: case KVM_CAP_SREGS2: + case KVM_CAP_EXIT_ON_EMULATION_FAILURE: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -5649,6 +5650,13 @@ split_irqchip_unlock: kvm->arch.hypercall_exit_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_EXIT_ON_EMULATION_FAILURE: + r = -EINVAL; + if (cap->args[0] & ~1) + break; + kvm->arch.exit_on_emulation_error = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -7444,8 +7452,33 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + u32 insn_size = ctxt->fetch.end - ctxt->fetch.data; + struct kvm_run *run = vcpu->run; + + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; + run->emulation_failure.ndata = 0; + run->emulation_failure.flags = 0; + + if (insn_size) { + run->emulation_failure.ndata = 3; + run->emulation_failure.flags |= + KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES; + run->emulation_failure.insn_size = insn_size; + memset(run->emulation_failure.insn_bytes, 0x90, + sizeof(run->emulation_failure.insn_bytes)); + memcpy(run->emulation_failure.insn_bytes, + ctxt->fetch.data, insn_size); + } +} + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { + struct kvm *kvm = vcpu->kvm; + ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); @@ -7454,10 +7487,9 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } - if (emulation_type & EMULTYPE_SKIP) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + if (kvm->arch.exit_on_emulation_error || + (emulation_type & EMULTYPE_SKIP)) { + prepare_emulation_failure_exit(vcpu); return 0; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f1ba602260f6..68c9e6d8bbda 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -280,6 +280,9 @@ struct kvm_xen_exit { /* Encounter unexpected vm-exit reason */ #define KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON 4 +/* Flags that describe what fields in emulation_failure hold valid data. */ +#define KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES (1ULL << 0) + /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ struct kvm_run { /* in */ @@ -383,6 +386,25 @@ struct kvm_run { __u32 ndata; __u64 data[16]; } internal; + /* + * KVM_INTERNAL_ERROR_EMULATION + * + * "struct emulation_failure" is an overlay of "struct internal" + * that is used for the KVM_INTERNAL_ERROR_EMULATION sub-type of + * KVM_EXIT_INTERNAL_ERROR. Note, unlike other internal error + * sub-types, this struct is ABI! It also needs to be backwards + * compatible with "struct internal". Take special care that + * "ndata" is correct, that new fields are enumerated in "flags", + * and that each flag enumerates fields that are 64-bit aligned + * and sized (so that ndata+internal.data[] is valid/accurate). + */ + struct { + __u32 suberror; + __u32 ndata; + __u64 flags; + __u8 insn_size; + __u8 insn_bytes[15]; + } emulation_failure; /* KVM_EXIT_OSI */ struct { __u64 gprs[32]; @@ -1088,6 +1110,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_EXIT_HYPERCALL 201 #define KVM_CAP_PPC_RPT_INVALIDATE 202 #define KVM_CAP_BINARY_STATS_FD 203 +#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 88213da2351479c529c368a9b763c4d52f02255b Mon Sep 17 00:00:00 2001 From: Aaron Lewis Date: Wed, 23 Jun 2021 20:34:27 +0000 Subject: kvm: x86: disable the narrow guest module parameter on unload When the kvm_intel module unloads the module parameter 'allow_smaller_maxphyaddr' is not cleared because the backing variable is defined in the kvm module. As a result, if the module parameter's state was set before kvm_intel unloads, it will also be set when it reloads. Explicitly clear the state in vmx_exit() to prevent this from happening. Signed-off-by: Aaron Lewis Message-Id: <20210623203426.1891402-1-aaronlewis@google.com> Signed-off-by: Paolo Bonzini Reviewed-by: Jim Mattson --- arch/x86/kvm/vmx/vmx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5a1067c42f3a..104bbbe2dfd0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7925,6 +7925,8 @@ static void vmx_exit(void) } #endif vmx_cleanup_l1d_flush(); + + allow_smaller_maxphyaddr = false; } module_exit(vmx_exit); -- cgit v1.2.3 From a01b45e9d34d278129296daf91c4771143fa9dd9 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 23 Jun 2021 14:29:55 +0300 Subject: KVM: x86: rename apic_access_page_done to apic_access_memslot_enabled This better reflects the purpose of this variable on AMD, since on AMD the AVIC's memory slot can be enabled and disabled dynamically. Signed-off-by: Maxim Levitsky Message-Id: <20210623113002.111448-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/avic.c | 4 ++-- arch/x86/kvm/vmx/vmx.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c9ec5c76c438..974cbfb1eefe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1054,7 +1054,7 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; - bool apic_access_page_done; + bool apic_access_memslot_enabled; unsigned long apicv_inhibit_reasons; gpa_t wall_clock; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a9abed054cd5..1d01da64c333 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -236,7 +236,7 @@ static int avic_update_access_page(struct kvm *kvm, bool activate) * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT * memory region. So, we need to ensure that kvm->mm == current->mm. */ - if ((kvm->arch.apic_access_page_done == activate) || + if ((kvm->arch.apic_access_memslot_enabled == activate) || (kvm->mm != current->mm)) goto out; @@ -249,7 +249,7 @@ static int avic_update_access_page(struct kvm *kvm, bool activate) goto out; } - kvm->arch.apic_access_page_done = activate; + kvm->arch.apic_access_memslot_enabled = activate; out: mutex_unlock(&kvm->slots_lock); return r; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 104bbbe2dfd0..927a552393b9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3621,7 +3621,7 @@ static int alloc_apic_access_page(struct kvm *kvm) int ret = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page_done) + if (kvm->arch.apic_access_memslot_enabled) goto out; hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); @@ -3641,7 +3641,7 @@ static int alloc_apic_access_page(struct kvm *kvm) * is able to migrate it. */ put_page(page); - kvm->arch.apic_access_page_done = true; + kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return ret; -- cgit v1.2.3 From bce29ac9ce0bb0b0b146b687ab978378c21e9078 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 22 Jun 2021 16:42:27 +0200 Subject: trace: Add osnoise tracer In the context of high-performance computing (HPC), the Operating System Noise (*osnoise*) refers to the interference experienced by an application due to activities inside the operating system. In the context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the system. Moreover, hardware-related jobs can also cause noise, for example, via SMIs. The osnoise tracer leverages the hwlat_detector by running a similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing all the sources of *osnoise* during its execution. Using the same approach of hwlat, osnoise takes note of the entry and exit point of any source of interferences, increasing a per-cpu interference counter. The osnoise tracer also saves an interference counter for each source of interference. The interference counter for NMI, IRQs, SoftIRQs, and threads is increased anytime the tool observes these interferences' entry events. When a noise happens without any interference from the operating system level, the hardware noise counter increases, pointing to a hardware-related noise. In this way, osnoise can account for any source of interference. At the end of the period, the osnoise tracer prints the sum of all noise, the max single noise, the percentage of CPU available for the thread, and the counters for the noise sources. Usage Write the ASCII text "osnoise" into the current_tracer file of the tracing system (generally mounted at /sys/kernel/tracing). For example:: [root@f32 ~]# cd /sys/kernel/tracing/ [root@f32 tracing]# echo osnoise > current_tracer It is possible to follow the trace by reading the trace trace file:: [root@f32 tracing]# cat trace # tracer: osnoise # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth MAX # || / SINGLE Interference counters: # |||| RUNTIME NOISE % OF CPU NOISE +-----------------------------+ # TASK-PID CPU# |||| TIMESTAMP IN US IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD # | | | |||| | | | | | | | | | | <...>-859 [000] .... 81.637220: 1000000 190 99.98100 9 18 0 1007 18 1 <...>-860 [001] .... 81.638154: 1000000 656 99.93440 74 23 0 1006 16 3 <...>-861 [002] .... 81.638193: 1000000 5675 99.43250 202 6 0 1013 25 21 <...>-862 [003] .... 81.638242: 1000000 125 99.98750 45 1 0 1011 23 0 <...>-863 [004] .... 81.638260: 1000000 1721 99.82790 168 7 0 1002 49 41 <...>-864 [005] .... 81.638286: 1000000 263 99.97370 57 6 0 1006 26 2 <...>-865 [006] .... 81.638302: 1000000 109 99.98910 21 3 0 1006 18 1 <...>-866 [007] .... 81.638326: 1000000 7816 99.21840 107 8 0 1016 39 19 In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the tracer prints a message at the end of each period for each CPU that is running an osnoise/CPU thread. The osnoise specific fields report: - The RUNTIME IN USE reports the amount of time in microseconds that the osnoise thread kept looping reading the time. - The NOISE IN US reports the sum of noise in microseconds observed by the osnoise tracer during the associated runtime. - The % OF CPU AVAILABLE reports the percentage of CPU available for the osnoise thread during the runtime window. - The MAX SINGLE NOISE IN US reports the maximum single noise observed during the runtime window. - The Interference counters display how many each of the respective interference happened during the runtime window. Note that the example above shows a high number of HW noise samples. The reason being is that this sample was taken on a virtual machine, and the host interference is detected as a hardware interference. Tracer options The tracer has a set of options inside the osnoise directory, they are: - osnoise/cpus: CPUs at which a osnoise thread will execute. - osnoise/period_us: the period of the osnoise thread. - osnoise/runtime_us: how long an osnoise thread will look for noise. - osnoise/stop_tracing_us: stop the system tracing if a single noise higher than the configured value happens. Writing 0 disables this option. - osnoise/stop_tracing_total_us: stop the system tracing if total noise higher than the configured value happens. Writing 0 disables this option. - tracing_threshold: the minimum delta between two time() reads to be considered as noise, in us. When set to 0, the default value will be used, which is currently 5 us. Additional Tracing In addition to the tracer, a set of tracepoints were added to facilitate the identification of the osnoise source. - osnoise:sample_threshold: printed anytime a noise is higher than the configurable tolerance_ns. - osnoise:nmi_noise: noise from NMI, including the duration. - osnoise:irq_noise: noise from an IRQ, including the duration. - osnoise:softirq_noise: noise from a SoftIRQ, including the duration. - osnoise:thread_noise: noise from a thread, including the duration. Note that all the values are *net values*. For example, if while osnoise is running, another thread preempts the osnoise thread, it will start a thread_noise duration at the start. Then, an IRQ takes place, preempting the thread_noise, starting a irq_noise. When the IRQ ends its execution, it will compute its duration, and this duration will be subtracted from the thread_noise, in such a way as to avoid the double accounting of the IRQ execution. This logic is valid for all sources of noise. Here is one example of the usage of these tracepoints:: osnoise/8-961 [008] d.h. 5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns osnoise/8-961 [008] dNh. 5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns migration/8-54 [008] d... 5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns osnoise/8-961 [008] .... 5789.858413: sample_threshold: start 5789.858404555 duration 8723 ns interferences 2 In this example, a noise sample of 8 microseconds was reported in the last line, pointing to two interferences. Looking backward in the trace, the two previous entries were about the migration thread running after a timer IRQ execution. The first event is not part of the noise because it took place one millisecond before. It is worth noticing that the sum of the duration reported in the tracepoints is smaller than eight us reported in the sample_threshold. The reason roots in the overhead of the entry and exit code that happens before and after any interference execution. This justifies the dual approach: measuring thread and tracing. Link: https://lkml.kernel.org/r/e649467042d60e7b62714c9c6751a56299d15119.1624372313.git.bristot@redhat.com Cc: Phil Auld Cc: Sebastian Andrzej Siewior Cc: Kate Carcia Cc: Jonathan Corbet Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Clark Willaims Cc: John Kacur Cc: Juri Lelli Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira [ Made the following functions static: trace_irqentry_callback() trace_irqexit_callback() trace_intel_irqentry_callback() trace_intel_irqexit_callback() Added to include/trace.h: osnoise_arch_register() osnoise_arch_unregister() Fixed define logic for LATENCY_FS_NOTIFY Reported-by: kernel test robot ] Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/index.rst | 1 + Documentation/trace/osnoise-tracer.rst | 152 ++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/trace.c | 237 ++++++ include/linux/ftrace_irq.h | 13 + include/linux/trace.h | 5 + include/trace/events/osnoise.h | 142 ++++ kernel/trace/Kconfig | 34 + kernel/trace/Makefile | 1 + kernel/trace/trace.h | 9 +- kernel/trace/trace_entries.h | 25 + kernel/trace/trace_osnoise.c | 1384 ++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 72 +- 13 files changed, 2072 insertions(+), 4 deletions(-) create mode 100644 Documentation/trace/osnoise-tracer.rst create mode 100644 arch/x86/kernel/trace.c create mode 100644 include/trace/events/osnoise.h create mode 100644 kernel/trace/trace_osnoise.c (limited to 'arch/x86') diff --git a/Documentation/trace/index.rst b/Documentation/trace/index.rst index f634b36fd3aa..608107b27cc0 100644 --- a/Documentation/trace/index.rst +++ b/Documentation/trace/index.rst @@ -23,6 +23,7 @@ Linux Tracing Technologies histogram-design boottime-trace hwlat_detector + osnoise-tracer intel_th ring-buffer-design stm diff --git a/Documentation/trace/osnoise-tracer.rst b/Documentation/trace/osnoise-tracer.rst new file mode 100644 index 000000000000..37a3c10fb216 --- /dev/null +++ b/Documentation/trace/osnoise-tracer.rst @@ -0,0 +1,152 @@ +============== +OSNOISE Tracer +============== + +In the context of high-performance computing (HPC), the Operating System +Noise (*osnoise*) refers to the interference experienced by an application +due to activities inside the operating system. In the context of Linux, +NMIs, IRQs, SoftIRQs, and any other system thread can cause noise to the +system. Moreover, hardware-related jobs can also cause noise, for example, +via SMIs. + +hwlat_detector is one of the tools used to identify the most complex +source of noise: *hardware noise*. + +In a nutshell, the hwlat_detector creates a thread that runs +periodically for a given period. At the beginning of a period, the thread +disables interrupt and starts sampling. While running, the hwlatd +thread reads the time in a loop. As interrupts are disabled, threads, +IRQs, and SoftIRQs cannot interfere with the hwlatd thread. Hence, the +cause of any gap between two different reads of the time roots either on +NMI or in the hardware itself. At the end of the period, hwlatd enables +interrupts and reports the max observed gap between the reads. It also +prints a NMI occurrence counter. If the output does not report NMI +executions, the user can conclude that the hardware is the culprit for +the latency. The hwlat detects the NMI execution by observing +the entry and exit of a NMI. + +The osnoise tracer leverages the hwlat_detector by running a +similar loop with preemption, SoftIRQs and IRQs enabled, thus allowing +all the sources of *osnoise* during its execution. Using the same approach +of hwlat, osnoise takes note of the entry and exit point of any +source of interferences, increasing a per-cpu interference counter. The +osnoise tracer also saves an interference counter for each source of +interference. The interference counter for NMI, IRQs, SoftIRQs, and +threads is increased anytime the tool observes these interferences' entry +events. When a noise happens without any interference from the operating +system level, the hardware noise counter increases, pointing to a +hardware-related noise. In this way, osnoise can account for any +source of interference. At the end of the period, the osnoise tracer +prints the sum of all noise, the max single noise, the percentage of CPU +available for the thread, and the counters for the noise sources. + +Usage +----- + +Write the ASCII text "osnoise" into the current_tracer file of the +tracing system (generally mounted at /sys/kernel/tracing). + +For example:: + + [root@f32 ~]# cd /sys/kernel/tracing/ + [root@f32 tracing]# echo osnoise > current_tracer + +It is possible to follow the trace by reading the trace trace file:: + + [root@f32 tracing]# cat trace + # tracer: osnoise + # + # _-----=> irqs-off + # / _----=> need-resched + # | / _---=> hardirq/softirq + # || / _--=> preempt-depth MAX + # || / SINGLE Interference counters: + # |||| RUNTIME NOISE % OF CPU NOISE +-----------------------------+ + # TASK-PID CPU# |||| TIMESTAMP IN US IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD + # | | | |||| | | | | | | | | | | + <...>-859 [000] .... 81.637220: 1000000 190 99.98100 9 18 0 1007 18 1 + <...>-860 [001] .... 81.638154: 1000000 656 99.93440 74 23 0 1006 16 3 + <...>-861 [002] .... 81.638193: 1000000 5675 99.43250 202 6 0 1013 25 21 + <...>-862 [003] .... 81.638242: 1000000 125 99.98750 45 1 0 1011 23 0 + <...>-863 [004] .... 81.638260: 1000000 1721 99.82790 168 7 0 1002 49 41 + <...>-864 [005] .... 81.638286: 1000000 263 99.97370 57 6 0 1006 26 2 + <...>-865 [006] .... 81.638302: 1000000 109 99.98910 21 3 0 1006 18 1 + <...>-866 [007] .... 81.638326: 1000000 7816 99.21840 107 8 0 1016 39 19 + +In addition to the regular trace fields (from TASK-PID to TIMESTAMP), the +tracer prints a message at the end of each period for each CPU that is +running an osnoise/ thread. The osnoise specific fields report: + + - The RUNTIME IN USE reports the amount of time in microseconds that + the osnoise thread kept looping reading the time. + - The NOISE IN US reports the sum of noise in microseconds observed + by the osnoise tracer during the associated runtime. + - The % OF CPU AVAILABLE reports the percentage of CPU available for + the osnoise thread during the runtime window. + - The MAX SINGLE NOISE IN US reports the maximum single noise observed + during the runtime window. + - The Interference counters display how many each of the respective + interference happened during the runtime window. + +Note that the example above shows a high number of HW noise samples. +The reason being is that this sample was taken on a virtual machine, +and the host interference is detected as a hardware interference. + +Tracer options +--------------------- + +The tracer has a set of options inside the osnoise directory, they are: + + - osnoise/cpus: CPUs at which a osnoise thread will execute. + - osnoise/period_us: the period of the osnoise thread. + - osnoise/runtime_us: how long an osnoise thread will look for noise. + - osnoise/stop_tracing_us: stop the system tracing if a single noise + higher than the configured value happens. Writing 0 disables this + option. + - osnoise/stop_tracing_total_us: stop the system tracing if total noise + higher than the configured value happens. Writing 0 disables this + option. + - tracing_threshold: the minimum delta between two time() reads to be + considered as noise, in us. When set to 0, the default value will + will be used, which is currently 5 us. + +Additional Tracing +------------------ + +In addition to the tracer, a set of tracepoints were added to +facilitate the identification of the osnoise source. + + - osnoise:sample_threshold: printed anytime a noise is higher than + the configurable tolerance_ns. + - osnoise:nmi_noise: noise from NMI, including the duration. + - osnoise:irq_noise: noise from an IRQ, including the duration. + - osnoise:softirq_noise: noise from a SoftIRQ, including the + duration. + - osnoise:thread_noise: noise from a thread, including the duration. + +Note that all the values are *net values*. For example, if while osnoise +is running, another thread preempts the osnoise thread, it will start a +thread_noise duration at the start. Then, an IRQ takes place, preempting +the thread_noise, starting a irq_noise. When the IRQ ends its execution, +it will compute its duration, and this duration will be subtracted from +the thread_noise, in such a way as to avoid the double accounting of the +IRQ execution. This logic is valid for all sources of noise. + +Here is one example of the usage of these tracepoints:: + + osnoise/8-961 [008] d.h. 5789.857532: irq_noise: local_timer:236 start 5789.857529929 duration 1845 ns + osnoise/8-961 [008] dNh. 5789.858408: irq_noise: local_timer:236 start 5789.858404871 duration 2848 ns + migration/8-54 [008] d... 5789.858413: thread_noise: migration/8:54 start 5789.858409300 duration 3068 ns + osnoise/8-961 [008] .... 5789.858413: sample_threshold: start 5789.858404555 duration 8812 ns interferences 2 + +In this example, a noise sample of 8 microseconds was reported in the last +line, pointing to two interferences. Looking backward in the trace, the +two previous entries were about the migration thread running after a +timer IRQ execution. The first event is not part of the noise because +it took place one millisecond before. + +It is worth noticing that the sum of the duration reported in the +tracepoints is smaller than eight us reported in the sample_threshold. +The reason roots in the overhead of the entry and exit code that happens +before and after any interference execution. This justifies the dual +approach: measuring thread and tracing. diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0f66682ac02a..3e625c61f008 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o +obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c new file mode 100644 index 000000000000..6912672c33a7 --- /dev/null +++ b/arch/x86/kernel/trace.c @@ -0,0 +1,237 @@ +#include +#include + +#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC) +extern void osnoise_trace_irq_entry(int id); +extern void osnoise_trace_irq_exit(int id, const char *desc); + +/* + * trace_intel_irq_entry - record intel specific IRQ entry + */ +static void trace_intel_irq_entry(void *data, int vector) +{ + osnoise_trace_irq_entry(vector); +} + +/* + * trace_intel_irq_exit - record intel specific IRQ exit + */ +static void trace_intel_irq_exit(void *data, int vector) +{ + char *vector_desc = (char *) data; + + osnoise_trace_irq_exit(vector, vector_desc); +} + +/* + * register_intel_irq_tp - Register intel specific IRQ entry tracepoints + */ +int osnoise_arch_register(void) +{ + int ret; + + ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_err; + + ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); + if (ret) + goto out_timer_entry; + +#ifdef CONFIG_X86_THERMAL_VECTOR + ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_timer_exit; + + ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + if (ret) + goto out_thermal_entry; +#endif /* CONFIG_X86_THERMAL_VECTOR */ + +#ifdef CONFIG_X86_MCE_AMD + ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_thermal_exit; + + ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + if (ret) + goto out_deferred_entry; +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_deferred_exit; + + ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + if (ret) + goto out_threshold_entry; +#endif /* CONFIG_X86_MCE_THRESHOLD */ + +#ifdef CONFIG_SMP + ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_threshold_exit; + + ret = register_trace_call_function_single_exit(trace_intel_irq_exit, + "call_function_single"); + if (ret) + goto out_call_function_single_entry; + + ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_single_exit; + + ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + if (ret) + goto out_call_function_entry; + + ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_call_function_exit; + + ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + if (ret) + goto out_reschedule_entry; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_IRQ_WORK + ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_reschedule_exit; + + ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + if (ret) + goto out_irq_work_entry; +#endif + + ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_irq_work_exit; + + ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + if (ret) + goto out_x86_ipi_entry; + + ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_x86_ipi_exit; + + ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + if (ret) + goto out_error_apic_entry; + + ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + if (ret) + goto out_error_apic_exit; + + ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + if (ret) + goto out_spurious_apic_entry; + + return 0; + +out_spurious_apic_entry: + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); +out_error_apic_exit: + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); +out_error_apic_entry: + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); +out_x86_ipi_exit: + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); +out_x86_ipi_entry: + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); +out_irq_work_exit: + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); +out_irq_work_entry: + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +out_reschedule_exit: +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); +out_reschedule_entry: + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); +out_call_function_exit: + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); +out_call_function_entry: + unregister_trace_call_function_entry(trace_intel_irq_entry, NULL); +out_call_function_single_exit: + unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single"); +out_call_function_single_entry: + unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL); +out_threshold_exit: +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); +out_threshold_entry: + unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); +out_deferred_exit: +#endif + +#ifdef CONFIG_X86_MCE_AMD + unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); +out_deferred_entry: + unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); +out_thermal_exit: +#endif /* CONFIG_X86_MCE_AMD */ + +#ifdef CONFIG_X86_THERMAL_VECTOR + unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); +out_thermal_entry: + unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); +out_timer_exit: +#endif /* CONFIG_X86_THERMAL_VECTOR */ + + unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); +out_timer_entry: + unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); +out_err: + return -EINVAL; +} + +void osnoise_arch_unregister(void) +{ + unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic"); + unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL); + unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic"); + unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL); + unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi"); + unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL); + +#ifdef CONFIG_IRQ_WORK + unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work"); + unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_SMP + unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule"); + unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL); + unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function"); + unregister_trace_call_function_entry(trace_intel_irq_entry, NULL); + unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single"); + unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD + unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic"); + unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_MCE_AMD + unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error"); + unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL); +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR + unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic"); + unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL); +#endif /* CONFIG_X86_THERMAL_VECTOR */ + + unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); + unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); +} +#endif /* CONFIG_OSNOISE_TRAECR && CONFIG_X86_LOCAL_APIC */ diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 0abd9a1d2852..f6faa31289ba 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -7,12 +7,21 @@ extern bool trace_hwlat_callback_enabled; extern void trace_hwlat_callback(bool enter); #endif +#ifdef CONFIG_OSNOISE_TRACER +extern bool trace_osnoise_callback_enabled; +extern void trace_osnoise_callback(bool enter); +#endif + static inline void ftrace_nmi_enter(void) { #ifdef CONFIG_HWLAT_TRACER if (trace_hwlat_callback_enabled) trace_hwlat_callback(true); #endif +#ifdef CONFIG_OSNOISE_TRACER + if (trace_osnoise_callback_enabled) + trace_osnoise_callback(true); +#endif } static inline void ftrace_nmi_exit(void) @@ -21,6 +30,10 @@ static inline void ftrace_nmi_exit(void) if (trace_hwlat_callback_enabled) trace_hwlat_callback(false); #endif +#ifdef CONFIG_OSNOISE_TRACER + if (trace_osnoise_callback_enabled) + trace_osnoise_callback(false); +#endif } #endif /* _LINUX_FTRACE_IRQ_H */ diff --git a/include/linux/trace.h b/include/linux/trace.h index be1e130ed87c..4e3858640c47 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -41,6 +41,11 @@ int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); struct trace_array *trace_array_get_by_name(const char *name); int trace_array_destroy(struct trace_array *tr); + +/* For osnoise tracer */ +int osnoise_arch_register(void); +void osnoise_arch_unregister(void); + #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_H */ diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h new file mode 100644 index 000000000000..28762c69f6c9 --- /dev/null +++ b/include/trace/events/osnoise.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM osnoise + +#if !defined(_OSNOISE_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _OSNOISE_TRACE_H + +#include +TRACE_EVENT(thread_noise, + + TP_PROTO(struct task_struct *t, u64 start, u64 duration), + + TP_ARGS(t, start, duration), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN) + __field( u64, start ) + __field( u64, duration) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%8s:%d start %llu.%09u duration %llu ns", + __entry->comm, + __entry->pid, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(softirq_noise, + + TP_PROTO(int vector, u64 start, u64 duration), + + TP_ARGS(vector, start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%8s:%d start %llu.%09u duration %llu ns", + show_softirq_name(__entry->vector), + __entry->vector, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(irq_noise, + + TP_PROTO(int vector, const char *desc, u64 start, u64 duration), + + TP_ARGS(vector, desc, start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __string( desc, desc ) + __field( int, vector ) + + ), + + TP_fast_assign( + __assign_str(desc, desc); + __entry->vector = vector; + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("%s:%d start %llu.%09u duration %llu ns", + __get_str(desc), + __entry->vector, + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(nmi_noise, + + TP_PROTO(u64 start, u64 duration), + + TP_ARGS(start, duration), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + ), + + TP_fast_assign( + __entry->start = start; + __entry->duration = duration; + ), + + TP_printk("start %llu.%09u duration %llu ns", + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration) +); + +TRACE_EVENT(sample_threshold, + + TP_PROTO(u64 start, u64 duration, u64 interference), + + TP_ARGS(start, duration, interference), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, duration) + __field( u64, interference) + ), + + TP_fast_assign( + __entry->start = start; + __entry->duration = duration; + __entry->interference = interference; + ), + + TP_printk("start %llu.%09u duration %llu ns interferences %llu", + __print_ns_to_secs(__entry->start), + __print_ns_without_secs(__entry->start), + __entry->duration, + __entry->interference) +); + +#endif /* _TRACE_OSNOISE_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7fa82778c3e6..41582ae4682b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -356,6 +356,40 @@ config HWLAT_TRACER file. Every time a latency is greater than tracing_thresh, it will be recorded into the ring buffer. +config OSNOISE_TRACER + bool "OS Noise tracer" + select GENERIC_TRACER + help + In the context of high-performance computing (HPC), the Operating + System Noise (osnoise) refers to the interference experienced by an + application due to activities inside the operating system. In the + context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread + can cause noise to the system. Moreover, hardware-related jobs can + also cause noise, for example, via SMIs. + + The osnoise tracer leverages the hwlat_detector by running a similar + loop with preemption, SoftIRQs and IRQs enabled, thus allowing all + the sources of osnoise during its execution. The osnoise tracer takes + note of the entry and exit point of any source of interferences, + increasing a per-cpu interference counter. It saves an interference + counter for each source of interference. The interference counter for + NMI, IRQs, SoftIRQs, and threads is increased anytime the tool + observes these interferences' entry events. When a noise happens + without any interference from the operating system level, the + hardware noise counter increases, pointing to a hardware-related + noise. In this way, osnoise can account for any source of + interference. At the end of the period, the osnoise tracer prints + the sum of all noise, the max single noise, the percentage of CPU + available for the thread, and the counters for the noise sources. + + In addition to the tracer, a set of tracepoints were added to + facilitate the identification of the osnoise source. + + The output will appear in the trace and trace_pipe files. + + To enable this tracer, echo in "osnoise" into the current_tracer + file. + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b28d3e5013cd..b1c47ccf4f73 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o +obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 87588d1e24ca..b959c9ec9711 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -44,6 +44,7 @@ enum trace_type { TRACE_BLK, TRACE_BPUTS, TRACE_HWLAT, + TRACE_OSNOISE, TRACE_RAW_DATA, TRACE_FUNC_REPEATS, @@ -297,7 +298,8 @@ struct trace_array { struct array_buffer max_buffer; bool allocated_snapshot; #endif -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) +#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ + || defined(CONFIG_OSNOISE_TRACER) unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -445,6 +447,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ + IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ @@ -675,8 +678,8 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ -#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ - defined(CONFIG_FSNOTIFY) +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ + || defined(CONFIG_OSNOISE_TRACER)) && defined(CONFIG_FSNOTIFY) #define LATENCY_FS_NOTIFY #endif diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 251c819cf0c5..158c0984b59b 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -360,3 +360,28 @@ FTRACE_ENTRY(func_repeats, func_repeats_entry, __entry->count, FUNC_REPEATS_GET_DELTA_TS(__entry)) ); + +FTRACE_ENTRY(osnoise, osnoise_entry, + + TRACE_OSNOISE, + + F_STRUCT( + __field( u64, noise ) + __field( u64, runtime ) + __field( u64, max_sample ) + __field( unsigned int, hw_count ) + __field( unsigned int, nmi_count ) + __field( unsigned int, irq_count ) + __field( unsigned int, softirq_count ) + __field( unsigned int, thread_count ) + ), + + F_printk("noise:%llu\tmax_sample:%llu\thw:%u\tnmi:%u\tirq:%u\tsoftirq:%u\tthread:%u\n", + __entry->noise, + __entry->max_sample, + __entry->hw_count, + __entry->nmi_count, + __entry->irq_count, + __entry->softirq_count, + __entry->thread_count) +); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c new file mode 100644 index 000000000000..4e2c47dc4f19 --- /dev/null +++ b/kernel/trace/trace_osnoise.c @@ -0,0 +1,1384 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OS Noise Tracer: computes the OS Noise suffered by a running thread. + * + * Based on "hwlat_detector" tracer by: + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. + * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. + * With feedback from Clark Williams + * + * And also based on the rtsl tracer presented on: + * DE OLIVEIRA, Daniel Bristot, et al. Demystifying the real-time linux + * scheduling latency. In: 32nd Euromicro Conference on Real-Time Systems + * (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum fur Informatik, 2020. + * + * Copyright (C) 2021 Daniel Bristot de Oliveira, Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +#ifdef CONFIG_X86_LOCAL_APIC +#include +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#endif /* CONFIG_X86_LOCAL_APIC */ + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +static struct trace_array *osnoise_trace; + +/* + * Default values. + */ +#define BANNER "osnoise: " +#define DEFAULT_SAMPLE_PERIOD 1000000 /* 1s */ +#define DEFAULT_SAMPLE_RUNTIME 1000000 /* 1s */ + +/* + * NMI runtime info. + */ +struct osn_nmi { + u64 count; + u64 delta_start; +}; + +/* + * IRQ runtime info. + */ +struct osn_irq { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +/* + * sofirq runtime info. + */ +struct osn_softirq { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +/* + * thread runtime info. + */ +struct osn_thread { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +/* + * Runtime information: this structure saves the runtime information used by + * one sampling thread. + */ +struct osnoise_variables { + struct task_struct *kthread; + bool sampling; + pid_t pid; + struct osn_nmi nmi; + struct osn_irq irq; + struct osn_softirq softirq; + struct osn_thread thread; + local_t int_counter; +}; + +/* + * Per-cpu runtime information. + */ +DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var); + +/* + * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU + */ +static inline struct osnoise_variables *this_cpu_osn_var(void) +{ + return this_cpu_ptr(&per_cpu_osnoise_var); +} + +/* + * osn_var_reset - Reset the values of the given osnoise_variables + */ +static inline void osn_var_reset(struct osnoise_variables *osn_var) +{ + /* + * So far, all the values are initialized as 0, so + * zeroing the structure is perfect. + */ + memset(osn_var, 0, sizeof(*osn_var)); +} + +/* + * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables + */ +static inline void osn_var_reset_all(void) +{ + struct osnoise_variables *osn_var; + int cpu; + + for_each_cpu(cpu, cpu_online_mask) { + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + osn_var_reset(osn_var); + } +} + +/* + * Tells NMIs to call back to the osnoise tracer to record timestamps. + */ +bool trace_osnoise_callback_enabled; + +/* + * osnoise sample structure definition. Used to store the statistics of a + * sample run. + */ +struct osnoise_sample { + u64 runtime; /* runtime */ + u64 noise; /* noise */ + u64 max_sample; /* max single noise sample */ + int hw_count; /* # HW (incl. hypervisor) interference */ + int nmi_count; /* # NMIs during this sample */ + int irq_count; /* # IRQs during this sample */ + int softirq_count; /* # softirqs during this sample */ + int thread_count; /* # threads during this sample */ +}; + +/* + * Protect the interface. + */ +struct mutex interface_lock; + +/* + * Tracer data. + */ +static struct osnoise_data { + u64 sample_period; /* total sampling period */ + u64 sample_runtime; /* active sampling portion of period */ + u64 stop_tracing; /* stop trace in the inside operation (loop) */ + u64 stop_tracing_total; /* stop trace in the outside operation (report) */ + bool tainted; /* infor users and developers about a problem */ +} osnoise_data = { + .sample_period = DEFAULT_SAMPLE_PERIOD, + .sample_runtime = DEFAULT_SAMPLE_RUNTIME, + .stop_tracing = 0, + .stop_tracing_total = 0, +}; + +/* + * Boolean variable used to inform that the tracer is currently sampling. + */ +static bool osnoise_busy; + +/* + * Print the osnoise header info. + */ +static void print_osnoise_headers(struct seq_file *s) +{ + if (osnoise_data.tainted) + seq_puts(s, "# osnoise is tainted!\n"); + + seq_puts(s, "# _-----=> irqs-off\n"); + seq_puts(s, "# / _----=> need-resched\n"); + seq_puts(s, "# | / _---=> hardirq/softirq\n"); + seq_puts(s, "# || / _--=> preempt-depth "); + seq_puts(s, " MAX\n"); + + seq_puts(s, "# || / "); + seq_puts(s, " SINGLE Interference counters:\n"); + + seq_puts(s, "# |||| RUNTIME "); + seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); + + seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP IN US "); + seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); + + seq_puts(s, "# | | | |||| | | "); + seq_puts(s, " | | | | | | | |\n"); +} + +/* + * osnoise_taint - report an osnoise error. + */ +#define osnoise_taint(msg) ({ \ + struct trace_array *tr = osnoise_trace; \ + \ + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \ + osnoise_data.tainted = true; \ +}) + +/* + * Record an osnoise_sample into the tracer buffer. + */ +static void trace_osnoise_sample(struct osnoise_sample *sample) +{ + struct trace_array *tr = osnoise_trace; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct trace_event_call *call = &event_osnoise; + struct ring_buffer_event *event; + struct osnoise_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_OSNOISE, sizeof(*entry), + tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->runtime = sample->runtime; + entry->noise = sample->noise; + entry->max_sample = sample->max_sample; + entry->hw_count = sample->hw_count; + entry->nmi_count = sample->nmi_count; + entry->irq_count = sample->irq_count; + entry->softirq_count = sample->softirq_count; + entry->thread_count = sample->thread_count; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +/* + * Macros to encapsulate the time capturing infrastructure. + */ +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) + +/* + * cond_move_irq_delta_start - Forward the delta_start of a running IRQ + * + * If an IRQ is preempted by an NMI, its delta_start is pushed forward + * to discount the NMI interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_irq_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->irq.delta_start) + osn_var->irq.delta_start += duration; +} + +#ifndef CONFIG_PREEMPT_RT +/* + * cond_move_softirq_delta_start - Forward the delta_start of a running softirq. + * + * If a softirq is preempted by an IRQ or NMI, its delta_start is pushed + * forward to discount the interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_softirq_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->softirq.delta_start) + osn_var->softirq.delta_start += duration; +} +#else /* CONFIG_PREEMPT_RT */ +#define cond_move_softirq_delta_start(osn_var, duration) do {} while (0) +#endif + +/* + * cond_move_thread_delta_start - Forward the delta_start of a running thread + * + * If a noisy thread is preempted by an softirq, IRQ or NMI, its delta_start + * is pushed forward to discount the interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->thread.delta_start) + osn_var->thread.delta_start += duration; +} + +/* + * get_int_safe_duration - Get the duration of a window + * + * The irq, softirq and thread varaibles need to have its duration without + * the interference from higher priority interrupts. Instead of keeping a + * variable to discount the interrupt interference from these variables, the + * starting time of these variables are pushed forward with the interrupt's + * duration. In this way, a single variable is used to: + * + * - Know if a given window is being measured. + * - Account its duration. + * - Discount the interference. + * + * To avoid getting inconsistent values, e.g.,: + * + * now = time_get() + * ---> interrupt! + * delta_start -= int duration; + * <--- + * duration = now - delta_start; + * + * result: negative duration if the variable duration before the + * interrupt was smaller than the interrupt execution. + * + * A counter of interrupts is used. If the counter increased, try + * to capture an interference safe duration. + */ +static inline s64 +get_int_safe_duration(struct osnoise_variables *osn_var, u64 *delta_start) +{ + u64 int_counter, now; + s64 duration; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + now = time_get(); + duration = (now - *delta_start); + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + /* + * This is an evidence of race conditions that cause + * a value to be "discounted" too much. + */ + if (duration < 0) + osnoise_taint("Negative duration!\n"); + + *delta_start = 0; + + return duration; +} + +/* + * + * set_int_safe_time - Save the current time on *time, aware of interference + * + * Get the time, taking into consideration a possible interference from + * higher priority interrupts. + * + * See get_int_safe_duration() for an explanation. + */ +static u64 +set_int_safe_time(struct osnoise_variables *osn_var, u64 *time) +{ + u64 int_counter; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + *time = time_get(); + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + return int_counter; +} + +/* + * trace_osnoise_callback - NMI entry/exit callback + * + * This function is called at the entry and exit NMI code. The bool enter + * distinguishes between either case. This function is used to note a NMI + * occurrence, compute the noise caused by the NMI, and to remove the noise + * it is potentially causing on other interference variables. + */ +void trace_osnoise_callback(bool enter) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + u64 duration; + + if (!osn_var->sampling) + return; + + /* + * Currently trace_clock_local() calls sched_clock() and the + * generic version is not NMI safe. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { + if (enter) { + osn_var->nmi.delta_start = time_get(); + local_inc(&osn_var->int_counter); + } else { + duration = time_get() - osn_var->nmi.delta_start; + + trace_nmi_noise(osn_var->nmi.delta_start, duration); + + cond_move_irq_delta_start(osn_var, duration); + cond_move_softirq_delta_start(osn_var, duration); + cond_move_thread_delta_start(osn_var, duration); + } + } + + if (enter) + osn_var->nmi.count++; +} + +/* + * osnoise_trace_irq_entry - Note the starting of an IRQ + * + * Save the starting time of an IRQ. As IRQs are non-preemptive to other IRQs, + * it is safe to use a single variable (ons_var->irq) to save the statistics. + * The arrival_time is used to report... the arrival time. The delta_start + * is used to compute the duration at the IRQ exit handler. See + * cond_move_irq_delta_start(). + */ +void osnoise_trace_irq_entry(int id) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (!osn_var->sampling) + return; + /* + * This value will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->irq.arrival_time = time_get(); + set_int_safe_time(osn_var, &osn_var->irq.delta_start); + osn_var->irq.count++; + + local_inc(&osn_var->int_counter); +} + +/* + * osnoise_irq_exit - Note the end of an IRQ, sava data and trace + * + * Computes the duration of the IRQ noise, and trace it. Also discounts the + * interference from other sources of noise could be currently being accounted. + */ +void osnoise_trace_irq_exit(int id, const char *desc) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + int duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->irq.delta_start); + trace_irq_noise(id, desc, osn_var->irq.arrival_time, duration); + osn_var->irq.arrival_time = 0; + cond_move_softirq_delta_start(osn_var, duration); + cond_move_thread_delta_start(osn_var, duration); +} + +/* + * trace_irqentry_callback - Callback to the irq:irq_entry traceevent + * + * Used to note the starting of an IRQ occurece. + */ +static void trace_irqentry_callback(void *data, int irq, + struct irqaction *action) +{ + osnoise_trace_irq_entry(irq); +} + +/* + * trace_irqexit_callback - Callback to the irq:irq_exit traceevent + * + * Used to note the end of an IRQ occurece. + */ +static void trace_irqexit_callback(void *data, int irq, + struct irqaction *action, int ret) +{ + osnoise_trace_irq_exit(irq, action->name); +} + +/* + * arch specific register function. + */ +int __weak osnoise_arch_register(void) +{ + return 0; +} + +/* + * arch specific unregister function. + */ +void __weak osnoise_arch_unregister(void) +{ + return; +} + +/* + * hook_irq_events - Hook IRQ handling events + * + * This function hooks the IRQ related callbacks to the respective trace + * events. + */ +int hook_irq_events(void) +{ + int ret; + + ret = register_trace_irq_handler_entry(trace_irqentry_callback, NULL); + if (ret) + goto out_err; + + ret = register_trace_irq_handler_exit(trace_irqexit_callback, NULL); + if (ret) + goto out_unregister_entry; + + ret = osnoise_arch_register(); + if (ret) + goto out_irq_exit; + + return 0; + +out_irq_exit: + unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); +out_unregister_entry: + unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL); +out_err: + return -EINVAL; +} + +/* + * unhook_irq_events - Unhook IRQ handling events + * + * This function unhooks the IRQ related callbacks to the respective trace + * events. + */ +void unhook_irq_events(void) +{ + osnoise_arch_unregister(); + unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); + unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL); +} + +#ifndef CONFIG_PREEMPT_RT +/* + * trace_softirq_entry_callback - Note the starting of a softirq + * + * Save the starting time of a softirq. As softirqs are non-preemptive to + * other softirqs, it is safe to use a single variable (ons_var->softirq) + * to save the statistics. The arrival_time is used to report... the + * arrival time. The delta_start is used to compute the duration at the + * softirq exit handler. See cond_move_softirq_delta_start(). + */ +void trace_softirq_entry_callback(void *data, unsigned int vec_nr) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (!osn_var->sampling) + return; + /* + * This value will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->softirq.arrival_time = time_get(); + set_int_safe_time(osn_var, &osn_var->softirq.delta_start); + osn_var->softirq.count++; + + local_inc(&osn_var->int_counter); +} + +/* + * trace_softirq_exit_callback - Note the end of an softirq + * + * Computes the duration of the softirq noise, and trace it. Also discounts the + * interference from other sources of noise could be currently being accounted. + */ +void trace_softirq_exit_callback(void *data, unsigned int vec_nr) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + int duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start); + trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration); + cond_move_thread_delta_start(osn_var, duration); + osn_var->softirq.arrival_time = 0; +} + +/* + * hook_softirq_events - Hook softirq handling events + * + * This function hooks the softirq related callbacks to the respective trace + * events. + */ +static int hook_softirq_events(void) +{ + int ret; + + ret = register_trace_softirq_entry(trace_softirq_entry_callback, NULL); + if (ret) + goto out_err; + + ret = register_trace_softirq_exit(trace_softirq_exit_callback, NULL); + if (ret) + goto out_unreg_entry; + + return 0; + +out_unreg_entry: + unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL); +out_err: + return -EINVAL; +} + +/* + * unhook_softirq_events - Unhook softirq handling events + * + * This function hooks the softirq related callbacks to the respective trace + * events. + */ +static void unhook_softirq_events(void) +{ + unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL); + unregister_trace_softirq_exit(trace_softirq_exit_callback, NULL); +} +#else /* CONFIG_PREEMPT_RT */ +/* + * softirq are threads on the PREEMPT_RT mode. + */ +static int hook_softirq_events(void) +{ + return 0; +} +static void unhook_softirq_events(void) +{ +} +#endif + +/* + * thread_entry - Record the starting of a thread noise window + * + * It saves the context switch time for a noisy thread, and increments + * the interference counters. + */ +static void +thread_entry(struct osnoise_variables *osn_var, struct task_struct *t) +{ + if (!osn_var->sampling) + return; + /* + * The arrival time will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->thread.arrival_time = time_get(); + + set_int_safe_time(osn_var, &osn_var->thread.delta_start); + + osn_var->thread.count++; + local_inc(&osn_var->int_counter); +} + +/* + * thread_exit - Report the end of a thread noise window + * + * It computes the total noise from a thread, tracing if needed. + */ +static void +thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) +{ + int duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start); + + trace_thread_noise(t, osn_var->thread.arrival_time, duration); + + osn_var->thread.arrival_time = 0; +} + +/* + * trace_sched_switch - sched:sched_switch trace event handler + * + * This function is hooked to the sched:sched_switch trace event, and it is + * used to record the beginning and to report the end of a thread noise window. + */ +void +trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, + struct task_struct *n) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (p->pid != osn_var->pid) + thread_exit(osn_var, p); + + if (n->pid != osn_var->pid) + thread_entry(osn_var, n); +} + +/* + * hook_thread_events - Hook the insturmentation for thread noise + * + * Hook the osnoise tracer callbacks to handle the noise from other + * threads on the necessary kernel events. + */ +int hook_thread_events(void) +{ + int ret; + + ret = register_trace_sched_switch(trace_sched_switch_callback, NULL); + if (ret) + return -EINVAL; + + return 0; +} + +/* + * unhook_thread_events - *nhook the insturmentation for thread noise + * + * Unook the osnoise tracer callbacks to handle the noise from other + * threads on the necessary kernel events. + */ +void unhook_thread_events(void) +{ + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); +} + +/* + * save_osn_sample_stats - Save the osnoise_sample statistics + * + * Save the osnoise_sample statistics before the sampling phase. These + * values will be used later to compute the diff betwneen the statistics + * before and after the osnoise sampling. + */ +void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +{ + s->nmi_count = osn_var->nmi.count; + s->irq_count = osn_var->irq.count; + s->softirq_count = osn_var->softirq.count; + s->thread_count = osn_var->thread.count; +} + +/* + * diff_osn_sample_stats - Compute the osnoise_sample statistics + * + * After a sample period, compute the difference on the osnoise_sample + * statistics. The struct osnoise_sample *s contains the statistics saved via + * save_osn_sample_stats() before the osnoise sampling. + */ +void diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +{ + s->nmi_count = osn_var->nmi.count - s->nmi_count; + s->irq_count = osn_var->irq.count - s->irq_count; + s->softirq_count = osn_var->softirq.count - s->softirq_count; + s->thread_count = osn_var->thread.count - s->thread_count; +} + +/* + * osnoise_stop_tracing - Stop tracing and the tracer. + */ +static void osnoise_stop_tracing(void) +{ + struct trace_array *tr = osnoise_trace; + tracer_tracing_off(tr); +} + +/* + * run_osnoise - Sample the time and look for osnoise + * + * Used to capture the time, looking for potential osnoise latency repeatedly. + * Different from hwlat_detector, it is called with preemption and interrupts + * enabled. This allows irqs, softirqs and threads to run, interfering on the + * osnoise sampling thread, as they would do with a regular thread. + */ +static int run_osnoise(void) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + u64 noise = 0, sum_noise = 0, max_noise = 0; + struct trace_array *tr = osnoise_trace; + u64 start, sample, last_sample; + u64 last_int_count, int_count; + s64 total, last_total = 0; + struct osnoise_sample s; + unsigned int threshold; + int hw_count = 0; + u64 runtime, stop_in; + int ret = -1; + + /* + * Considers the current thread as the workload. + */ + osn_var->pid = current->pid; + + /* + * Save the current stats for the diff + */ + save_osn_sample_stats(osn_var, &s); + + /* + * if threshold is 0, use the default value of 5 us. + */ + threshold = tracing_thresh ? : 5000; + + /* + * Make sure NMIs see sampling first + */ + osn_var->sampling = true; + barrier(); + + /* + * Transform the *_us config to nanoseconds to avoid the + * division on the main loop. + */ + runtime = osnoise_data.sample_runtime * NSEC_PER_USEC; + stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; + + /* + * Start timestemp + */ + start = time_get(); + + /* + * "previous" loop. + */ + last_int_count = set_int_safe_time(osn_var, &last_sample); + + do { + /* + * Get sample! + */ + int_count = set_int_safe_time(osn_var, &sample); + + noise = time_sub(sample, last_sample); + + /* + * This shouldn't happen. + */ + if (noise < 0) { + osnoise_taint("negative noise!"); + goto out; + } + + /* + * Sample runtime. + */ + total = time_sub(sample, start); + + /* + * Check for possible overflows. + */ + if (total < last_total) { + osnoise_taint("total overflow!"); + break; + } + + last_total = total; + + if (noise >= threshold) { + int interference = int_count - last_int_count; + + if (noise > max_noise) + max_noise = noise; + + if (!interference) + hw_count++; + + sum_noise += noise; + + trace_sample_threshold(last_sample, noise, interference); + + if (osnoise_data.stop_tracing) + if (noise > stop_in) + osnoise_stop_tracing(); + } + + /* + * For the non-preemptive kernel config: let threads runs, if + * they so wish. + */ + cond_resched(); + + last_sample = sample; + last_int_count = int_count; + + } while (total < runtime && !kthread_should_stop()); + + /* + * Finish the above in the view for interrupts. + */ + barrier(); + + osn_var->sampling = false; + + /* + * Make sure sampling data is no longer updated. + */ + barrier(); + + /* + * Save noise info. + */ + s.noise = time_to_us(sum_noise); + s.runtime = time_to_us(total); + s.max_sample = time_to_us(max_noise); + s.hw_count = hw_count; + + /* Save interference stats info */ + diff_osn_sample_stats(osn_var, &s); + + trace_osnoise_sample(&s); + + /* Keep a running maximum ever recorded osnoise "latency" */ + if (max_noise > tr->max_latency) { + tr->max_latency = max_noise; + latency_fsnotify(tr); + } + + if (osnoise_data.stop_tracing_total) + if (s.noise > osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + + return 0; +out: + return ret; +} + +static struct cpumask osnoise_cpumask; +static struct cpumask save_cpumask; + +/* + * osnoise_main - The osnoise detection kernel thread + * + * Calls run_osnoise() function to measure the osnoise for the configured runtime, + * every period. + */ +static int osnoise_main(void *data) +{ + s64 interval; + + while (!kthread_should_stop()) { + + run_osnoise(); + + mutex_lock(&interface_lock); + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + do_div(interval, USEC_PER_MSEC); + + /* + * differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (interval < 1) + continue; + + if (msleep_interruptible(interval)) + break; + } + + return 0; +} + +/* + * stop_per_cpu_kthread - stop per-cpu threads + * + * Stop the osnoise sampling htread. Use this on unload and at system + * shutdown. + */ +static void stop_per_cpu_kthreads(void) +{ + struct task_struct *kthread; + int cpu; + + for_each_online_cpu(cpu) { + kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + if (kthread) + kthread_stop(kthread); + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } +} + +/* + * start_per_cpu_kthread - Kick off per-cpu osnoise sampling kthreads + * + * This starts the kernel thread that will look for osnoise on many + * cpus. + */ +static int start_per_cpu_kthreads(struct trace_array *tr) +{ + struct cpumask *current_mask = &save_cpumask; + struct task_struct *kthread; + char comm[24]; + int cpu; + + get_online_cpus(); + /* + * Run only on CPUs in which trace and osnoise are allowed to run. + */ + cpumask_and(current_mask, tr->tracing_cpumask, &osnoise_cpumask); + /* + * And the CPU is online. + */ + cpumask_and(current_mask, cpu_online_mask, current_mask); + put_online_cpus(); + + for_each_online_cpu(cpu) + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + + for_each_cpu(cpu, current_mask) { + snprintf(comm, 24, "osnoise/%d", cpu); + + kthread = kthread_create_on_cpu(osnoise_main, NULL, cpu, comm); + + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + stop_per_cpu_kthreads(); + return -ENOMEM; + } + + per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + wake_up_process(kthread); + } + + return 0; +} + +/* + * osnoise_cpus_read - Read function for reading the "cpus" file + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints the "cpus" output into the user-provided buffer. + */ +static ssize_t +osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, + loff_t *ppos) +{ + char *mask_str; + int len; + + mutex_lock(&interface_lock); + + len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) { + count = -ENOMEM; + goto out_unlock; + } + + len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); + if (len >= count) { + count = -EINVAL; + goto out_free; + } + + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); + +out_free: + kfree(mask_str); +out_unlock: + mutex_unlock(&interface_lock); + + return count; +} + +static void osnoise_tracer_start(struct trace_array *tr); +static void osnoise_tracer_stop(struct trace_array *tr); + +/* + * osnoise_cpus_write - Write function for "cpus" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "cpus" + * interface to the osnoise trace. By default, it lists all CPUs, + * in this way, allowing osnoise threads to run on any online CPU + * of the system. It serves to restrict the execution of osnoise to the + * set of CPUs writing via this interface. Note that osnoise also + * respects the "tracing_cpumask." Hence, osnoise threads will run only + * on the set of CPUs allowed here AND on "tracing_cpumask." Why not + * have just "tracing_cpumask?" Because the user might be interested + * in tracing what is running on other CPUs. For instance, one might + * run osnoise in one HT CPU while observing what is running on the + * sibling HT CPU. + */ +static ssize_t +osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, + loff_t *ppos) +{ + struct trace_array *tr = osnoise_trace; + cpumask_var_t osnoise_cpumask_new; + int running, err; + char buf[256]; + + if (count >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, count)) + return -EFAULT; + + if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = cpulist_parse(buf, osnoise_cpumask_new); + if (err) + goto err_free; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop + * and osnoise_busy. + */ + mutex_lock(&trace_types_lock); + running = osnoise_busy; + if (running) + osnoise_tracer_stop(tr); + + mutex_lock(&interface_lock); + cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); + mutex_unlock(&interface_lock); + + if (running) + osnoise_tracer_start(tr); + mutex_unlock(&trace_types_lock); + + free_cpumask_var(osnoise_cpumask_new); + return count; + +err_free: + free_cpumask_var(osnoise_cpumask_new); + + return err; +} + +/* + * osnoise/runtime_us: cannot be greater than the period. + */ +static struct trace_min_max_param osnoise_runtime = { + .lock = &interface_lock, + .val = &osnoise_data.sample_runtime, + .max = &osnoise_data.sample_period, + .min = NULL, +}; + +/* + * osnoise/period_us: cannot be smaller than the runtime. + */ +static struct trace_min_max_param osnoise_period = { + .lock = &interface_lock, + .val = &osnoise_data.sample_period, + .max = NULL, + .min = &osnoise_data.sample_runtime, +}; + +/* + * osnoise/stop_tracing_us: no limit. + */ +static struct trace_min_max_param osnoise_stop_tracing_in = { + .lock = &interface_lock, + .val = &osnoise_data.stop_tracing, + .max = NULL, + .min = NULL, +}; + +/* + * osnoise/stop_tracing_total_us: no limit. + */ +static struct trace_min_max_param osnoise_stop_tracing_total = { + .lock = &interface_lock, + .val = &osnoise_data.stop_tracing_total, + .max = NULL, + .min = NULL, +}; + +static const struct file_operations cpus_fops = { + .open = tracing_open_generic, + .read = osnoise_cpus_read, + .write = osnoise_cpus_write, + .llseek = generic_file_llseek, +}; + +/* + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "osnoise". It creates the + * "osnoise" directory in the tracing directory, and within that + * directory is the count, runtime and period files to change and view + * those values. + */ +static int init_tracefs(void) +{ + struct dentry *top_dir; + struct dentry *tmp; + int ret; + + ret = tracing_init_dentry(); + if (ret) + return -ENOMEM; + + top_dir = tracefs_create_dir("osnoise", NULL); + if (!top_dir) + return -ENOMEM; + + tmp = tracefs_create_file("period_us", 0640, top_dir, + &osnoise_period, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("runtime_us", 0644, top_dir, + &osnoise_runtime, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir, + &osnoise_stop_tracing_in, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir, + &osnoise_stop_tracing_total, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops); + if (!tmp) + goto err; + + return 0; + +err: + tracefs_remove(top_dir); + return -ENOMEM; +} + +static int osnoise_hook_events(void) +{ + int retval; + + /* + * Trace is already hooked, we are re-enabling from + * a stop_tracing_*. + */ + if (trace_osnoise_callback_enabled) + return 0; + + retval = hook_irq_events(); + if (retval) + return -EINVAL; + + retval = hook_softirq_events(); + if (retval) + goto out_unhook_irq; + + retval = hook_thread_events(); + /* + * All fine! + */ + if (!retval) + return 0; + + unhook_softirq_events(); +out_unhook_irq: + unhook_irq_events(); + return -EINVAL; +} + +static void osnoise_tracer_start(struct trace_array *tr) +{ + int retval; + + if (osnoise_busy) + return; + + osn_var_reset_all(); + + retval = osnoise_hook_events(); + if (retval) + goto out_err; + /* + * Make sure NMIs see reseted values. + */ + barrier(); + trace_osnoise_callback_enabled = true; + + retval = start_per_cpu_kthreads(tr); + /* + * all fine! + */ + if (!retval) + return; + +out_err: + unhook_irq_events(); + pr_err(BANNER "Error starting osnoise tracer\n"); +} + +static void osnoise_tracer_stop(struct trace_array *tr) +{ + if (!osnoise_busy) + return; + + trace_osnoise_callback_enabled = false; + barrier(); + + stop_per_cpu_kthreads(); + + unhook_irq_events(); + unhook_softirq_events(); + unhook_thread_events(); + + osnoise_busy = false; +} + +static int osnoise_tracer_init(struct trace_array *tr) +{ + /* Only allow one instance to enable this */ + if (osnoise_busy) + return -EBUSY; + + osnoise_trace = tr; + + tr->max_latency = 0; + + osnoise_tracer_start(tr); + + osnoise_busy = true; + + return 0; +} + +static void osnoise_tracer_reset(struct trace_array *tr) +{ + osnoise_tracer_stop(tr); +} + +static struct tracer osnoise_tracer __read_mostly = { + .name = "osnoise", + .init = osnoise_tracer_init, + .reset = osnoise_tracer_reset, + .start = osnoise_tracer_start, + .stop = osnoise_tracer_stop, + .print_header = print_osnoise_headers, + .allow_instances = true, +}; + +__init static int init_osnoise_tracer(void) +{ + int ret; + + mutex_init(&interface_lock); + + cpumask_copy(&osnoise_cpumask, cpu_all_mask); + + ret = register_tracer(&osnoise_tracer); + if (ret) + return ret; + + init_tracefs(); + + return 0; +} +late_initcall(init_osnoise_tracer); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d0368a569bfa..642b6584eba5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1202,7 +1202,6 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, return trace_handle_return(s); } - static enum print_line_t trace_hwlat_raw(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -1232,6 +1231,76 @@ static struct trace_event trace_hwlat_event = { .funcs = &trace_hwlat_funcs, }; +/* TRACE_OSNOISE */ +static enum print_line_t +trace_osnoise_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct osnoise_entry *field; + u64 ratio, ratio_dec; + u64 net_runtime; + + trace_assign_type(field, entry); + + /* + * compute the available % of cpu time. + */ + net_runtime = field->runtime - field->noise; + ratio = net_runtime * 10000000; + do_div(ratio, field->runtime); + ratio_dec = do_div(ratio, 100000); + + trace_seq_printf(s, "%llu %10llu %3llu.%05llu %7llu", + field->runtime, + field->noise, + ratio, ratio_dec, + field->max_sample); + + trace_seq_printf(s, " %6u", field->hw_count); + trace_seq_printf(s, " %6u", field->nmi_count); + trace_seq_printf(s, " %6u", field->irq_count); + trace_seq_printf(s, " %6u", field->softirq_count); + trace_seq_printf(s, " %6u", field->thread_count); + + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_osnoise_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct osnoise_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lld %llu %llu %u %u %u %u %u\n", + field->runtime, + field->noise, + field->max_sample, + field->hw_count, + field->nmi_count, + field->irq_count, + field->softirq_count, + field->thread_count); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_osnoise_funcs = { + .trace = trace_osnoise_print, + .raw = trace_osnoise_raw, +}; + +static struct trace_event trace_osnoise_event = { + .type = TRACE_OSNOISE, + .funcs = &trace_osnoise_funcs, +}; + /* TRACE_BPUTS */ static enum print_line_t trace_bputs_print(struct trace_iterator *iter, int flags, @@ -1442,6 +1511,7 @@ static struct trace_event *events[] __initdata = { &trace_bprint_event, &trace_print_event, &trace_hwlat_event, + &trace_osnoise_event, &trace_raw_data_event, &trace_func_repeats_event, NULL -- cgit v1.2.3 From 328aac5ecd119ede3633f7d17969b1ff34ccc784 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 22 Jun 2021 16:30:26 +0530 Subject: bpf, x86: Fix extable offset calculation Commit 4c5de127598e1 ("bpf: Emit explicit NULL pointer checks for PROBE_LDX instructions.") is emitting a couple of instructions before the actual load. Consider those additional instructions while calculating extable offset. Fixes: 4c5de127598e1 ("bpf: Emit explicit NULL pointer checks for PROBE_LDX instructions.") Signed-off-by: Ravi Bangoria Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210622110026.1157847-1-ravi.bangoria@linux.ibm.com --- arch/x86/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index db1e83813db5..e835164189f1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1281,7 +1281,7 @@ st: if (is_imm8(insn->off)) emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { struct exception_table_entry *ex; - u8 *_insn = image + proglen; + u8 *_insn = image + proglen + (start_of_ldx - temp); s64 delta; /* populate jmp_offset for JMP above */ -- cgit v1.2.3 From f7d9f6370e006400655ff96cb148f56598492d91 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 28 Jun 2021 11:45:47 +0200 Subject: trace/osnoise: Fix 'no previous prototype' warnings kernel test robot reported some osnoise functions with "no previous prototype." Fix these warnings by making local functions static, and by adding: void osnoise_trace_irq_entry(int id); void osnoise_trace_irq_exit(int id, const char *desc); to include/linux/trace.h. Link: https://lkml.kernel.org/r/e40d3cb4be8bde921f4b40fa6a095cf85ab807bd.1624872608.git.bristot@redhat.com Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Reported-by: kernel test robot Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/trace.c | 3 --- include/linux/trace.h | 2 ++ kernel/trace/trace_osnoise.c | 20 +++++++++++--------- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c index 6912672c33a7..6b73b6f92ad3 100644 --- a/arch/x86/kernel/trace.c +++ b/arch/x86/kernel/trace.c @@ -2,9 +2,6 @@ #include #if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC) -extern void osnoise_trace_irq_entry(int id); -extern void osnoise_trace_irq_exit(int id, const char *desc); - /* * trace_intel_irq_entry - record intel specific IRQ entry */ diff --git a/include/linux/trace.h b/include/linux/trace.h index 4e3858640c47..bf169612ffe1 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -45,6 +45,8 @@ int trace_array_destroy(struct trace_array *tr); /* For osnoise tracer */ int osnoise_arch_register(void); void osnoise_arch_unregister(void); +void osnoise_trace_irq_entry(int id); +void osnoise_trace_irq_exit(int id, const char *desc); #endif /* CONFIG_TRACING */ diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 556d530af805..9c3109e3ffeb 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -736,7 +736,7 @@ void __weak osnoise_arch_unregister(void) * This function hooks the IRQ related callbacks to the respective trace * events. */ -int hook_irq_events(void) +static int hook_irq_events(void) { int ret; @@ -768,7 +768,7 @@ out_err: * This function unhooks the IRQ related callbacks to the respective trace * events. */ -void unhook_irq_events(void) +static void unhook_irq_events(void) { osnoise_arch_unregister(); unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); @@ -785,7 +785,7 @@ void unhook_irq_events(void) * arrival time. The delta_start is used to compute the duration at the * softirq exit handler. See cond_move_softirq_delta_start(). */ -void trace_softirq_entry_callback(void *data, unsigned int vec_nr) +static void trace_softirq_entry_callback(void *data, unsigned int vec_nr) { struct osnoise_variables *osn_var = this_cpu_osn_var(); @@ -808,7 +808,7 @@ void trace_softirq_entry_callback(void *data, unsigned int vec_nr) * Computes the duration of the softirq noise, and trace it. Also discounts the * interference from other sources of noise could be currently being accounted. */ -void trace_softirq_exit_callback(void *data, unsigned int vec_nr) +static void trace_softirq_exit_callback(void *data, unsigned int vec_nr) { struct osnoise_variables *osn_var = this_cpu_osn_var(); int duration; @@ -949,7 +949,7 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) * This function is hooked to the sched:sched_switch trace event, and it is * used to record the beginning and to report the end of a thread noise window. */ -void +static void trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, struct task_struct *n) { @@ -968,7 +968,7 @@ trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p, * Hook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. */ -int hook_thread_events(void) +static int hook_thread_events(void) { int ret; @@ -985,7 +985,7 @@ int hook_thread_events(void) * Unook the osnoise tracer callbacks to handle the noise from other * threads on the necessary kernel events. */ -void unhook_thread_events(void) +static void unhook_thread_events(void) { unregister_trace_sched_switch(trace_sched_switch_callback, NULL); } @@ -997,7 +997,8 @@ void unhook_thread_events(void) * values will be used later to compute the diff betwneen the statistics * before and after the osnoise sampling. */ -void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +static void +save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) { s->nmi_count = osn_var->nmi.count; s->irq_count = osn_var->irq.count; @@ -1012,7 +1013,8 @@ void save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sam * statistics. The struct osnoise_sample *s contains the statistics saved via * save_osn_sample_stats() before the osnoise sampling. */ -void diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +static void +diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) { s->nmi_count = osn_var->nmi.count - s->nmi_count; s->irq_count = osn_var->irq.count - s->irq_count; -- cgit v1.2.3 From a4eec6a3dfb7a6257ddcacf15e9428fe5834ffd4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 28 Jun 2021 19:38:31 -0700 Subject: binfmt: remove in-tree usage of MAP_EXECUTABLE Ever since commit e9714acf8c43 ("mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas"), VM_EXECUTABLE is gone and MAP_EXECUTABLE is essentially completely ignored. Let's remove all usage of MAP_EXECUTABLE. [akpm@linux-foundation.org: fix blooper in fs/binfmt_aout.c. per David] Link: https://lkml.kernel.org/r/20210421093453.6904-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: "Eric W. Biederman" Reviewed-by: Kees Cook Cc: Alexander Shishkin Cc: Alexander Viro Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Catalin Marinas Cc: Don Zickus Cc: Feng Tang Cc: Greg Ungerer Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kevin Brodsky Cc: Mark Rutland Cc: Michal Hocko Cc: Mike Rapoport Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 4 ++-- fs/binfmt_aout.c | 4 ++-- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 11 ++--------- fs/binfmt_flat.c | 2 +- 5 files changed, 8 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index a09fc37ead9d..5e5b9fc2747f 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -203,7 +203,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | - MAP_EXECUTABLE | MAP_32BIT, + MAP_32BIT, fd_offset); if (error != N_TXTADDR(ex)) @@ -212,7 +212,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | - MAP_EXECUTABLE | MAP_32BIT, + MAP_32BIT, fd_offset + ex.a_text); if (error != N_DATADDR(ex)) return error; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 3e84e9bb9084..145917f734fe 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -222,7 +222,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, fd_offset); if (error != N_TXTADDR(ex)) @@ -230,7 +230,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, fd_offset + ex.a_text); if (error != N_DATADDR(ex)) return error; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 187b3f2b9202..baf8f91776f4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1070,7 +1070,7 @@ out_free_interp: elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); - elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + elf_flags = MAP_PRIVATE | MAP_DENYWRITE; vaddr = elf_ppnt->p_vaddr; /* diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 2c99b102c860..39fa1b0307e1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -928,7 +928,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( { struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; - unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; + unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0; int loop, ret; load_addr = params->load_addr; @@ -948,12 +948,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( } /* allocate one big anon block for everything */ - mflags = MAP_PRIVATE; - if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) - mflags |= MAP_EXECUTABLE; - maddr = vm_mmap(NULL, load_addr, top - base, - PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); + PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, 0); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1046,9 +1042,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_flags & PF_X) prot |= PROT_EXEC; flags = MAP_PRIVATE | MAP_DENYWRITE; - if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) - flags |= MAP_EXECUTABLE; - maddr = 0; switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a1072c6a2341..5d776f80ee50 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -573,7 +573,7 @@ static int load_flat_file(struct linux_binprm *bprm, pr_debug("ROM mapping of file (we hope)\n"); textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, - MAP_PRIVATE|MAP_EXECUTABLE, 0); + MAP_PRIVATE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { ret = textpos; if (!textpos) -- cgit v1.2.3 From 9ce2c3fc0be6e7d0bb2236a33bbb7a0f1943bd81 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 28 Jun 2021 19:39:14 -0700 Subject: x86/sgx: use vma_lookup() in sgx_encl_find() Use vma_lookup() to find the VMA at a specific address. As vma_lookup() will return NULL if the address is not within any VMA, the start address no longer needs to be validated. Link: https://lkml.kernel.org/r/20210521174745.2219620-10-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Laurent Dufour Acked-by: David Hildenbrand Acked-by: Davidlohr Bueso Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/sgx/encl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 6e74f85b6264..fec43ca65065 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *result; - result = find_vma(mm, addr); - if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start) + result = vma_lookup(mm, addr); + if (!result || result->vm_ops != &sgx_vm_ops) return -EINVAL; *vma = result; -- cgit v1.2.3 From a9ee6cf5c60ed1070e786e53665f9b2f23f2bd11 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 28 Jun 2021 19:43:01 -0700 Subject: mm: replace CONFIG_NEED_MULTIPLE_NODES with CONFIG_NUMA After removal of DISCINTIGMEM the NEED_MULTIPLE_NODES and NUMA configuration options are equivalent. Drop CONFIG_NEED_MULTIPLE_NODES and use CONFIG_NUMA instead. Done with $ sed -i 's/CONFIG_NEED_MULTIPLE_NODES/CONFIG_NUMA/' \ $(git grep -wl CONFIG_NEED_MULTIPLE_NODES) $ sed -i 's/NEED_MULTIPLE_NODES/NUMA/' \ $(git grep -wl NEED_MULTIPLE_NODES) with manual tweaks afterwards. [rppt@linux.ibm.com: fix arm boot crash] Link: https://lkml.kernel.org/r/YMj9vHhHOiCVN4BF@linux.ibm.com Link: https://lkml.kernel.org/r/20210608091316.3622-9-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Arnd Bergmann Acked-by: David Hildenbrand Cc: Geert Uytterhoeven Cc: Ivan Kokshaysky Cc: Jonathan Corbet Cc: Matt Turner Cc: Richard Henderson Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/ia64/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/mips/include/asm/mmzone.h | 2 +- arch/mips/include/asm/page.h | 2 +- arch/mips/mm/init.c | 4 ++-- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/mmzone.h | 4 ++-- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/kexec/core.c | 4 ++-- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mem.c | 4 ++-- arch/riscv/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/sh/include/asm/mmzone.h | 4 ++-- arch/sh/kernel/topology.c | 2 +- arch/sh/mm/Kconfig | 2 +- arch/sh/mm/init.c | 2 +- arch/sparc/Kconfig | 2 +- arch/sparc/include/asm/mmzone.h | 4 ++-- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/mm/init_64.c | 12 ++++++------ arch/x86/Kconfig | 2 +- arch/x86/kernel/setup_percpu.c | 6 +++--- arch/x86/mm/init_32.c | 4 ++-- include/asm-generic/topology.h | 2 +- include/linux/memblock.h | 6 +++--- include/linux/mm.h | 4 ++-- include/linux/mmzone.h | 6 +++--- kernel/crash_core.c | 2 +- mm/Kconfig | 9 --------- mm/memblock.c | 8 ++++---- mm/memory.c | 3 +-- mm/page_alloc.c | 6 +++--- mm/sparse.c | 2 +- 36 files changed, 59 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f1d8566bbf9..d01a1545ab8f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1035,7 +1035,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 1 10 default "4" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 279252e3e0f7..da22a35e6f03 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -302,7 +302,7 @@ config NODES_SHIFT int "Max num nodes shift(3-10)" range 3 10 default "10" - depends on NEED_MULTIPLE_NODES + depends on NUMA help This option specifies the maximum number of nodes in your SSI system. MAX_NUMNODES will be 2^(This value). diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ed51970c08e7..4704a16c2e44 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2867,7 +2867,7 @@ config RANDOMIZE_BASE_MAX_OFFSET config NODES_SHIFT int default "6" - depends on NEED_MULTIPLE_NODES + depends on NUMA config HW_PERF_EVENTS bool "Enable hardware performance counter support for perf events" diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h index 7649ab45e80c..602a21aee9d4 100644 --- a/arch/mips/include/asm/mmzone.h +++ b/arch/mips/include/asm/mmzone.h @@ -8,7 +8,7 @@ #include -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA # include #endif diff --git a/arch/mips/include/asm/page.h b/arch/mips/include/asm/page.h index 195ff4e9771f..96bc798c1ec1 100644 --- a/arch/mips/include/asm/page.h +++ b/arch/mips/include/asm/page.h @@ -239,7 +239,7 @@ static inline int pfn_valid(unsigned long pfn) /* pfn_valid is defined in linux/mmzone.h */ -#elif defined(CONFIG_NEED_MULTIPLE_NODES) +#elif defined(CONFIG_NUMA) #define pfn_valid(pfn) \ ({ \ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 97f6ca341448..19347dc6bbf8 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -394,7 +394,7 @@ void maar_init(void) } } -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; @@ -473,7 +473,7 @@ void __init mem_init(void) 0x80000000 - 4, KCORE_TEXT); #endif } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ void free_init_pages(const char *what, unsigned long begin, unsigned long end) { diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 088dd2afcfe4..14b132cf95e2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -671,7 +671,7 @@ config NODES_SHIFT int default "8" if PPC64 default "4" - depends on NEED_MULTIPLE_NODES + depends on NUMA config USE_PERCPU_NUMA_NODE_ID def_bool y diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 6cda76b57c5d..4c6c6dbd182f 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -18,7 +18,7 @@ * flags field of the struct page */ -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA extern struct pglist_data *node_data[]; /* @@ -41,7 +41,7 @@ u64 memory_hotplug_max(void); #else #define memory_hotplug_max() memblock_end_of_DRAM() -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ #ifdef CONFIG_FA_DUMP #define __HAVE_ARCH_RESERVED_KERNEL_PAGES #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e42b85e4f1aa..a35fbf4d0bce 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -788,7 +788,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = early_cpu_to_node(cpu); void *ptr; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 2e05c783440a..a5209ea3859e 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1047,7 +1047,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA /* * numa_node_id() works after this. */ diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 56da5eb2b923..48525e8b5730 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -68,11 +68,11 @@ void machine_kexec_cleanup(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA VMCOREINFO_SYMBOL(node_data); VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(contig_page_data); #endif #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index c3df3a8501d4..2ffcf540f08b 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,7 +13,7 @@ obj-y := fault.o mem.o pgtable.o mmap.o maccess.o \ obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ obj-$(CONFIG_PPC_BOOK3S_64) += book3s64/ -obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index a6b36a40897a..c5e520c6f13b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -127,7 +127,7 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, } #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init mem_topology_setup(void) { max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; @@ -162,7 +162,7 @@ static int __init mark_nonram_nosave(void) return 0; } -#else /* CONFIG_NEED_MULTIPLE_NODES */ +#else /* CONFIG_NUMA */ static int __init mark_nonram_nosave(void) { return 0; diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 18ec0f9bb8d5..15f9490a7aad 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -332,7 +332,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 1 10 default "2" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b4c7c34069f8..707afbcd81c2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -475,7 +475,7 @@ config NUMA config NODES_SHIFT int - depends on NEED_MULTIPLE_NODES + depends on NUMA default "1" config SCHED_SMT diff --git a/arch/sh/include/asm/mmzone.h b/arch/sh/include/asm/mmzone.h index 6552a088dc97..7b8dead2723d 100644 --- a/arch/sh/include/asm/mmzone.h +++ b/arch/sh/include/asm/mmzone.h @@ -2,7 +2,7 @@ #ifndef __ASM_SH_MMZONE_H #define __ASM_SH_MMZONE_H -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA #include extern struct pglist_data *node_data[]; @@ -31,7 +31,7 @@ static inline void setup_bootmem_node(int nid, unsigned long start, unsigned long end) { } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ /* Platform specific mem init */ void __init plat_mem_setup(void); diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c index 7a989eed3b18..76af6db9daa2 100644 --- a/arch/sh/kernel/topology.c +++ b/arch/sh/kernel/topology.c @@ -46,7 +46,7 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA for_each_online_node(i) register_one_node(i); #endif diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index d551a9cac41e..ba569cfb4368 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -120,7 +120,7 @@ config NODES_SHIFT int default "3" if CPU_SUBTYPE_SHX3 default "1" - depends on NEED_MULTIPLE_NODES + depends on NUMA config ARCH_FLATMEM_ENABLE def_bool y diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 168d7d4dd735..ce26c7f8950a 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -211,7 +211,7 @@ void __init allocate_pgdat(unsigned int nid) get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA NODE_DATA(nid) = memblock_alloc_try_nid( sizeof(struct pglist_data), SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 164a5254c91c..c72f52c704cd 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -265,7 +265,7 @@ config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" range 4 5 if SPARC64 default "5" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h index 6543fb97a849..a236d8aa893a 100644 --- a/arch/sparc/include/asm/mmzone.h +++ b/arch/sparc/include/asm/mmzone.h @@ -2,7 +2,7 @@ #ifndef _SPARC64_MMZONE_H #define _SPARC64_MMZONE_H -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA #include @@ -13,6 +13,6 @@ extern struct pglist_data *node_data[]; extern int numa_cpu_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[]; -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ #endif /* _SPARC64_MMZONE_H */ diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index e38d8bf454e8..c89a5971fb0d 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1546,7 +1546,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = cpu_to_node(cpu); void *ptr; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index e454f179cf5d..06e938d03f3b 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -903,7 +903,7 @@ struct node_mem_mask { static struct node_mem_mask node_masks[MAX_NUMNODES]; static int num_node_masks; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA struct mdesc_mlgroup { u64 node; @@ -1059,7 +1059,7 @@ static void __init allocate_node_data(int nid) { struct pglist_data *p; unsigned long start_pfn, end_pfn; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA NODE_DATA(nid) = memblock_alloc_node(sizeof(struct pglist_data), SMP_CACHE_BYTES, nid); @@ -1080,7 +1080,7 @@ static void __init allocate_node_data(int nid) static void init_node_masks_nonnuma(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int i; #endif @@ -1090,7 +1090,7 @@ static void init_node_masks_nonnuma(void) node_masks[0].match = 0; num_node_masks = 1; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA for (i = 0; i < NR_CPUS; i++) numa_cpu_lookup_table[i] = 0; @@ -1098,7 +1098,7 @@ static void init_node_masks_nonnuma(void) #endif } -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA struct pglist_data *node_data[MAX_NUMNODES]; EXPORT_SYMBOL(numa_cpu_lookup_table); @@ -2487,7 +2487,7 @@ int page_in_phys_avail(unsigned long paddr) static void __init register_page_bootmem_info(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int i; for_each_online_node(i) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0045e1b44190..5d523ff70fe7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1597,7 +1597,7 @@ config NODES_SHIFT default "10" if MAXSMP default "6" if X86_64 default "3" - depends on NEED_MULTIPLE_NODES + depends on NUMA help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0941d2f44f2a..78a32b956e81 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset); */ static bool __init pcpu_need_numa(void) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA pg_data_t *last = NULL; unsigned int cpu; @@ -101,7 +101,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, unsigned long align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int node = early_cpu_to_node(cpu); void *ptr; @@ -140,7 +140,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size) static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 21ffb03f6c72..74b78840182d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -651,7 +651,7 @@ void __init find_low_pfn_range(void) highmem_pfn_init(); } -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA void __init initmem_init(void) { #ifdef CONFIG_HIGHMEM @@ -677,7 +677,7 @@ void __init initmem_init(void) setup_bootmem_allocator(); } -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ void __init setup_bootmem_allocator(void) { diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 5aa8705df87e..4dbe715be65b 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -45,7 +45,7 @@ #endif #ifndef cpumask_of_node - #ifdef CONFIG_NEED_MULTIPLE_NODES + #ifdef CONFIG_NUMA #define cpumask_of_node(node) ((node) == 0 ? cpu_online_mask : cpu_none_mask) #else #define cpumask_of_node(node) ((void)(node), cpu_online_mask) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 5984fff3f175..552309342c38 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -50,7 +50,7 @@ struct memblock_region { phys_addr_t base; phys_addr_t size; enum memblock_flags flags; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int nid; #endif }; @@ -347,7 +347,7 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA static inline void memblock_set_region_node(struct memblock_region *r, int nid) { r->nid = nid; @@ -366,7 +366,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) { return 0; } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9bd21e6fad6a..07922ee1477e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -46,7 +46,7 @@ extern int sysctl_page_lock_unfairness; void init_mm_internals(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ +#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) @@ -2460,7 +2460,7 @@ extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6f9829562af2..4bd420ed3961 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1043,17 +1043,17 @@ extern int percpu_pagelist_high_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map -#else /* CONFIG_NEED_MULTIPLE_NODES */ +#else /* CONFIG_NUMA */ #include -#endif /* !CONFIG_NEED_MULTIPLE_NODES */ +#endif /* !CONFIG_NUMA */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 684a6061a13a..0a4780c047c9 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,7 +455,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmap_area_list); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); VMCOREINFO_SYMBOL(contig_page_data); #endif diff --git a/mm/Kconfig b/mm/Kconfig index 218b96ccc84a..bffe4bd859f3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -59,15 +59,6 @@ config FLAT_NODE_MEM_MAP def_bool y depends on !SPARSEMEM -# -# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's -# to represent different areas of memory. This variable allows -# those dependencies to exist individually. -# -config NEED_MULTIPLE_NODES - def_bool y - depends on NUMA - # # SPARSEMEM_EXTREME (which is the default) does some bootmem # allocations when sparse_init() is called. If this cannot diff --git a/mm/memblock.c b/mm/memblock.c index afaefa8fc6ab..123feef5259d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -92,7 +92,7 @@ * system initialization completes. */ -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA struct pglist_data __refdata contig_page_data; EXPORT_SYMBOL(contig_page_data); #endif @@ -607,7 +607,7 @@ repeat: * area, insert that portion. */ if (rbase > base) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA WARN_ON(nid != memblock_get_region_node(rgn)); #endif WARN_ON(flags != rgn->flags); @@ -1205,7 +1205,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid) { -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA int start_rgn, end_rgn; int i, ret; @@ -1849,7 +1849,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) size = rgn->size; end = base + size - 1; flags = rgn->flags; -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA if (memblock_get_region_node(rgn) != MAX_NUMNODES) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); diff --git a/mm/memory.c b/mm/memory.c index 3dd6b2e73e1d..48c4576df898 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -90,8 +90,7 @@ #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif -#ifndef CONFIG_NEED_MULTIPLE_NODES -/* use the per-pgdat data instead for discontigmem - mbligh */ +#ifndef CONFIG_NUMA unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8926f3fd3bcf..c4069f9e3968 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1634,7 +1634,7 @@ void __free_pages_core(struct page *page, unsigned int order) __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); } -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NUMA /* * During memory init memblocks map pfns to nids. The search is expensive and @@ -1684,7 +1684,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) return nid; } -#endif /* CONFIG_NEED_MULTIPLE_NODES */ +#endif /* CONFIG_NUMA */ void __init memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order) @@ -7438,7 +7438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", __func__, pgdat->node_id, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA /* * With no DISCONTIG, the global mem_map is just set as node 0's */ diff --git a/mm/sparse.c b/mm/sparse.c index 55c18aff3e42..7272f7a1449d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -346,7 +346,7 @@ size_t mem_section_usage_size(void) static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) { -#ifndef CONFIG_NEED_MULTIPLE_NODES +#ifndef CONFIG_NUMA return __pa_symbol(pgdat); #else return __pa(pgdat); -- cgit v1.2.3 From a3f5d80ea401ac857f2910e28b15f35b2cf902f4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 28 Jun 2021 19:43:14 -0700 Subject: mm,hwpoison: send SIGBUS with error virutal address Now an action required MCE in already hwpoisoned address surely sends a SIGBUS to current process, but the SIGBUS doesn't convey error virtual address. That's not optimal for hwpoison-aware applications. To fix the issue, make memory_failure() call kill_accessing_process(), that does pagetable walk to find the error virtual address. It could find multiple virtual addresses for the same error page, and it seems hard to tell which virtual address is correct one. But that's rare and sending incorrect virtual address could be better than no address. So let's report the first found virtual address for now. [naoya.horiguchi@nec.com: fix walk_page_range() return] Link: https://lkml.kernel.org/r/20210603051055.GA244241@hori.linux.bs1.fc.nec.co.jp Link: https://lkml.kernel.org/r/20210521030156.2612074-4-nao.horiguchi@gmail.com Signed-off-by: Naoya Horiguchi Cc: Tony Luck Cc: Aili Yao Cc: Oscar Salvador Cc: David Hildenbrand Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Jue Wang Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mce/core.c | 13 +++- include/linux/swapops.h | 5 ++ mm/memory-failure.c | 150 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 165 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index bf7fe87a7e88..22791aadc085 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1257,19 +1257,28 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + int ret; pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); if (!p->mce_ripv) flags |= MF_MUST_KILL; - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) && - !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { + ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) { set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); sync_core(); return; } + /* + * -EHWPOISON from memory_failure() means that it already sent SIGBUS + * to the current process with the proper error info, so no need to + * send SIGBUS here again. + */ + if (ret == -EHWPOISON) + return; + if (p->mce_vaddr != (void __user *)-1l) { force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT); } else { diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6430a94c6981..5907205c712c 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -330,6 +330,11 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } +static inline unsigned long hwpoison_entry_to_pfn(swp_entry_t entry) +{ + return swp_offset(entry); +} + static inline void num_poisoned_pages_inc(void) { atomic_long_inc(&num_poisoned_pages); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6f5f78885ab4..4d151ce3e50d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -554,6 +555,148 @@ static void collect_procs(struct page *page, struct list_head *tokill, collect_procs_file(page, tokill, force_early); } +struct hwp_walk { + struct to_kill tk; + unsigned long pfn; + int flags; +}; + +static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift) +{ + tk->addr = addr; + tk->size_shift = shift; +} + +static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, + unsigned long poisoned_pfn, struct to_kill *tk) +{ + unsigned long pfn = 0; + + if (pte_present(pte)) { + pfn = pte_pfn(pte); + } else { + swp_entry_t swp = pte_to_swp_entry(pte); + + if (is_hwpoison_entry(swp)) + pfn = hwpoison_entry_to_pfn(swp); + } + + if (!pfn || pfn != poisoned_pfn) + return 0; + + set_to_kill(tk, addr, shift); + return 1; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + pmd_t pmd = *pmdp; + unsigned long pfn; + unsigned long hwpoison_vaddr; + + if (!pmd_present(pmd)) + return 0; + pfn = pmd_pfn(pmd); + if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) { + hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT); + set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT); + return 1; + } + return 0; +} +#else +static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr, + struct hwp_walk *hwp) +{ + return 0; +} +#endif + +static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + int ret = 0; + pte_t *ptep; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmdp, walk->vma); + if (ptl) { + ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmdp)) + goto out; + + ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT, + hwp->pfn, &hwp->tk); + if (ret == 1) + break; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hwp_walk *hwp = (struct hwp_walk *)walk->private; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(walk->vma); + + return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), + hwp->pfn, &hwp->tk); +} +#else +#define hwpoison_hugetlb_range NULL +#endif + +static struct mm_walk_ops hwp_walk_ops = { + .pmd_entry = hwpoison_pte_range, + .hugetlb_entry = hwpoison_hugetlb_range, +}; + +/* + * Sends SIGBUS to the current process with error info. + * + * This function is intended to handle "Action Required" MCEs on already + * hardware poisoned pages. They could happen, for example, when + * memory_failure() failed to unmap the error page at the first call, or + * when multiple local machine checks happened on different CPUs. + * + * MCE handler currently has no easy access to the error virtual address, + * so this function walks page table to find it. The returned virtual address + * is proper in most cases, but it could be wrong when the application + * process has multiple entries mapping the error page. + */ +static int kill_accessing_process(struct task_struct *p, unsigned long pfn, + int flags) +{ + int ret; + struct hwp_walk priv = { + .pfn = pfn, + }; + priv.tk.tsk = p; + + mmap_read_lock(p->mm); + ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops, + (void *)&priv); + if (ret == 1 && priv.tk.addr) + kill_proc(&priv.tk, pfn, flags); + mmap_read_unlock(p->mm); + return ret ? -EFAULT : -EHWPOISON; +} + static const char *action_name[] = { [MF_IGNORED] = "Ignored", [MF_FAILED] = "Failed", @@ -1267,7 +1410,10 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return -EHWPOISON; + res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, page_to_pfn(head), flags); + return res; } num_poisoned_pages_inc(); @@ -1476,6 +1622,8 @@ try_again: pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + res = kill_accessing_process(current, pfn, flags); goto unlock_mutex; } -- cgit v1.2.3 From 426e5c429d16e4cd5ded46e21ff8e939bf8abd0f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:47:00 -0700 Subject: mm: memory_hotplug: factor out bootmem core functions to bootmem_info.c Patch series "Free some vmemmap pages of HugeTLB page", v23. This patch series will free some vmemmap pages(struct page structures) associated with each HugeTLB page when preallocated to save memory. In order to reduce the difficulty of the first version of code review. In this version, we disable PMD/huge page mapping of vmemmap if this feature was enabled. This acutely eliminates a bunch of the complex code doing page table manipulation. When this patch series is solid, we cam add the code of vmemmap page table manipulation in the future. The struct page structures (page structs) are used to describe a physical page frame. By default, there is an one-to-one mapping from a page frame to it's corresponding page struct. The HugeTLB pages consist of multiple base page size pages and is supported by many architectures. See hugetlbpage.rst in the Documentation directory for more details. On the x86 architecture, HugeTLB pages of size 2MB and 1GB are currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages. For each base page, there is a corresponding page struct. Within the HugeTLB subsystem, only the first 4 page structs are used to contain unique information about a HugeTLB page. HUGETLB_CGROUP_MIN_ORDER provides this upper limit. The only 'useful' information in the remaining page structs is the compound_head field, and this field is the same for all tail pages. By removing redundant page structs for HugeTLB pages, memory can returned to the buddy allocator for other uses. When the system boot up, every 2M HugeTLB has 512 struct page structs which size is 8 pages(sizeof(struct page) * 512 / PAGE_SIZE). HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | -------------> | 1 | | | +-----------+ +-----------+ | | | 2 | -------------> | 2 | | | +-----------+ +-----------+ | | | 3 | -------------> | 3 | | | +-----------+ +-----------+ | | | 4 | -------------> | 4 | | 2MB | +-----------+ +-----------+ | | | 5 | -------------> | 5 | | | +-----------+ +-----------+ | | | 6 | -------------> | 6 | | | +-----------+ +-----------+ | | | 7 | -------------> | 7 | | | +-----------+ +-----------+ | | | | | | +-----------+ The value of page->compound_head is the same for all tail pages. The first page of page structs (page 0) associated with the HugeTLB page contains the 4 page structs necessary to describe the HugeTLB. The only use of the remaining pages of page structs (page 1 to page 7) is to point to page->compound_head. Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs will be used for each HugeTLB page. This will allow us to free the remaining 6 pages to the buddy allocator. Here is how things look after remapping. HugeTLB struct pages(8 pages) page frame(8 pages) +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ | | | 0 | -------------> | 0 | | | +-----------+ +-----------+ | | | 1 | -------------> | 1 | | | +-----------+ +-----------+ | | | 2 | ----------------^ ^ ^ ^ ^ ^ | | +-----------+ | | | | | | | | 3 | ------------------+ | | | | | | +-----------+ | | | | | | | 4 | --------------------+ | | | | 2MB | +-----------+ | | | | | | 5 | ----------------------+ | | | | +-----------+ | | | | | 6 | ------------------------+ | | | +-----------+ | | | | 7 | --------------------------+ | | +-----------+ | | | | | | +-----------+ When a HugeTLB is freed to the buddy system, we should allocate 6 pages for vmemmap pages and restore the previous mapping relationship. Apart from 2MB HugeTLB page, we also have 1GB HugeTLB page. It is similar to the 2MB HugeTLB page. We also can use this approach to free the vmemmap pages. In this case, for the 1GB HugeTLB page, we can save 4094 pages. This is a very substantial gain. On our server, run some SPDK/QEMU applications which will use 1024GB HugeTLB page. With this feature enabled, we can save ~16GB (1G hugepage)/~12GB (2MB hugepage) memory. Because there are vmemmap page tables reconstruction on the freeing/allocating path, it increases some overhead. Here are some overhead analysis. 1) Allocating 10240 2MB HugeTLB pages. a) With this patch series applied: # time echo 10240 > /proc/sys/vm/nr_hugepages real 0m0.166s user 0m0.000s sys 0m0.166s # bpftrace -e 'kprobe:alloc_fresh_huge_page { @start[tid] = nsecs; } kretprobe:alloc_fresh_huge_page /@start[tid]/ { @latency = hist(nsecs - @start[tid]); delete(@start[tid]); }' Attaching 2 probes... @latency: [8K, 16K) 5476 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [16K, 32K) 4760 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [32K, 64K) 4 | | b) Without this patch series: # time echo 10240 > /proc/sys/vm/nr_hugepages real 0m0.067s user 0m0.000s sys 0m0.067s # bpftrace -e 'kprobe:alloc_fresh_huge_page { @start[tid] = nsecs; } kretprobe:alloc_fresh_huge_page /@start[tid]/ { @latency = hist(nsecs - @start[tid]); delete(@start[tid]); }' Attaching 2 probes... @latency: [4K, 8K) 10147 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [8K, 16K) 93 | | Summarize: this feature is about ~2x slower than before. 2) Freeing 10240 2MB HugeTLB pages. a) With this patch series applied: # time echo 0 > /proc/sys/vm/nr_hugepages real 0m0.213s user 0m0.000s sys 0m0.213s # bpftrace -e 'kprobe:free_pool_huge_page { @start[tid] = nsecs; } kretprobe:free_pool_huge_page /@start[tid]/ { @latency = hist(nsecs - @start[tid]); delete(@start[tid]); }' Attaching 2 probes... @latency: [8K, 16K) 6 | | [16K, 32K) 10227 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [32K, 64K) 7 | | b) Without this patch series: # time echo 0 > /proc/sys/vm/nr_hugepages real 0m0.081s user 0m0.000s sys 0m0.081s # bpftrace -e 'kprobe:free_pool_huge_page { @start[tid] = nsecs; } kretprobe:free_pool_huge_page /@start[tid]/ { @latency = hist(nsecs - @start[tid]); delete(@start[tid]); }' Attaching 2 probes... @latency: [4K, 8K) 6805 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [8K, 16K) 3427 |@@@@@@@@@@@@@@@@@@@@@@@@@@ | [16K, 32K) 8 | | Summary: The overhead of __free_hugepage is about ~2-3x slower than before. Although the overhead has increased, the overhead is not significant. Like Mike said, "However, remember that the majority of use cases create HugeTLB pages at or shortly after boot time and add them to the pool. So, additional overhead is at pool creation time. There is no change to 'normal run time' operations of getting a page from or returning a page to the pool (think page fault/unmap)". Despite the overhead and in addition to the memory gains from this series. The following data is obtained by Joao Martins. Very thanks to his effort. There's an additional benefit which is page (un)pinners will see an improvement and Joao presumes because there are fewer memmap pages and thus the tail/head pages are staying in cache more often. Out of the box Joao saw (when comparing linux-next against linux-next + this series) with gup_test and pinning a 16G HugeTLB file (with 1G pages): get_user_pages(): ~32k -> ~9k unpin_user_pages(): ~75k -> ~70k Usually any tight loop fetching compound_head(), or reading tail pages data (e.g. compound_head) benefit a lot. There's some unpinning inefficiencies Joao was fixing[2], but with that in added it shows even more: unpin_user_pages(): ~27k -> ~3.8k [1] https://lore.kernel.org/linux-mm/20210409205254.242291-1-mike.kravetz@oracle.com/ [2] https://lore.kernel.org/linux-mm/20210204202500.26474-1-joao.m.martins@oracle.com/ This patch (of 9): Move bootmem info registration common API to individual bootmem_info.c. And we will use {get,put}_page_bootmem() to initialize the page for the vmemmap pages or free the vmemmap pages to buddy in the later patch. So move them out of CONFIG_MEMORY_HOTPLUG_SPARSE. This is just code movement without any functional change. Link: https://lkml.kernel.org/r/20210510030027.56044-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20210510030027.56044-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Reviewed-by: Miaohe Lin Tested-by: Chen Huang Tested-by: Bodeddula Balasubramaniam Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Alexander Viro Cc: Paul E. McKenney Cc: Pawan Gupta Cc: Randy Dunlap Cc: Oliver Neukum Cc: Anshuman Khandual Cc: Joerg Roedel Cc: Mina Almasry Cc: David Rientjes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Barry Song Cc: HORIGUCHI NAOYA Cc: Joao Martins Cc: Xiongchun Duan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/init_64.c | 1 + arch/x86/mm/init_64.c | 3 +- include/linux/bootmem_info.h | 40 +++++++++++++ include/linux/memory_hotplug.h | 27 --------- mm/Makefile | 1 + mm/bootmem_info.c | 127 +++++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 116 ------------------------------------- mm/sparse.c | 1 + 8 files changed, 172 insertions(+), 144 deletions(-) create mode 100644 include/linux/bootmem_info.h create mode 100644 mm/bootmem_info.c (limited to 'arch/x86') diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 06e938d03f3b..1b23639e2fcd 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e527d829e1ed..3aaf1d30c777 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1623,7 +1624,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return err; } -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long nr_pages) { diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h new file mode 100644 index 000000000000..4ed6dee1adc9 --- /dev/null +++ b/include/linux/bootmem_info.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_BOOTMEM_INFO_H +#define __LINUX_BOOTMEM_INFO_H + +#include + +/* + * Types for free bootmem stored in page->lru.next. These have to be in + * some random range in unsigned long space for debugging purposes. + */ +enum { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, + SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, + MIX_SECTION_INFO, + NODE_INFO, + MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, +}; + +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE +void __init register_page_bootmem_info_node(struct pglist_data *pgdat); + +void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type); +void put_page_bootmem(struct page *page); +#else +static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) +{ +} + +static inline void put_page_bootmem(struct page *page) +{ +} + +static inline void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) +{ +} +#endif + +#endif /* __LINUX_BOOTMEM_INFO_H */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 28f32fd00fe9..a7fd2c3ccb77 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -18,18 +18,6 @@ struct vmem_altmap; #ifdef CONFIG_MEMORY_HOTPLUG struct page *pfn_to_online_page(unsigned long pfn); -/* - * Types for free bootmem stored in page->lru.next. These have to be in - * some random range in unsigned long space for debugging purposes. - */ -enum { - MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, - SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, - MIX_SECTION_INFO, - NODE_INFO, - MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, -}; - /* Types for control the zone type of onlined and offlined memory */ enum { /* Offline the memory. */ @@ -222,17 +210,6 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) #endif /* CONFIG_NUMA */ #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ -#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE -extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat); -#else -static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) -{ -} -#endif -extern void put_page_bootmem(struct page *page); -extern void get_page_bootmem(unsigned long ingo, struct page *page, - unsigned long type); - void get_online_mems(void); void put_online_mems(void); @@ -260,10 +237,6 @@ static inline void zone_span_writelock(struct zone *zone) {} static inline void zone_span_writeunlock(struct zone *zone) {} static inline void zone_seqlock_init(struct zone *zone) {} -static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) -{ -} - static inline int try_online_node(int nid) { return 0; diff --git a/mm/Makefile b/mm/Makefile index bf71e295e9f6..61ce71ddf7bb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -125,3 +125,4 @@ obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o +obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c new file mode 100644 index 000000000000..5b152dba7344 --- /dev/null +++ b/mm/bootmem_info.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Bootmem core functions. + * + * Copyright (c) 2020, Bytedance. + * + * Author: Muchun Song + * + */ +#include +#include +#include +#include +#include + +void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) +{ + page->freelist = (void *)type; + SetPagePrivate(page); + set_page_private(page, info); + page_ref_inc(page); +} + +void put_page_bootmem(struct page *page) +{ + unsigned long type; + + type = (unsigned long) page->freelist; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); + + if (page_ref_dec_return(page) == 1) { + page->freelist = NULL; + ClearPagePrivate(page); + set_page_private(page, 0); + INIT_LIST_HEAD(&page->lru); + free_reserved_page(page); + } +} + +#ifndef CONFIG_SPARSEMEM_VMEMMAP +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + struct mem_section_usage *usage; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + /* Get section's memmap address */ + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + /* + * Get page for the memmap's phys address + * XXX: need more consideration for sparse_vmemmap... + */ + page = virt_to_page(memmap); + mapsize = sizeof(struct page) * PAGES_PER_SECTION; + mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; + + /* remember memmap's page */ + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, SECTION_INFO); + + usage = ms->usage; + page = virt_to_page(usage); + + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); + +} +#else /* CONFIG_SPARSEMEM_VMEMMAP */ +static void register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + struct mem_section_usage *usage; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + + usage = ms->usage; + page = virt_to_page(usage); + + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +void __init register_page_bootmem_info_node(struct pglist_data *pgdat) +{ + unsigned long i, pfn, end_pfn, nr_pages; + int node = pgdat->node_id; + struct page *page; + + nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; + page = virt_to_page(pgdat); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + + pfn = pgdat->node_start_pfn; + end_pfn = pgdat_end_pfn(pgdat); + + /* register section info */ + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other nodes. + */ + if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } +} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 974a565797d8..ae7a07b02049 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -154,122 +154,6 @@ static void release_memory_resource(struct resource *res) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) -{ - page->freelist = (void *)type; - SetPagePrivate(page); - set_page_private(page, info); - page_ref_inc(page); -} - -void put_page_bootmem(struct page *page) -{ - unsigned long type; - - type = (unsigned long) page->freelist; - BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || - type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); - - if (page_ref_dec_return(page) == 1) { - page->freelist = NULL; - ClearPagePrivate(page); - set_page_private(page, 0); - INIT_LIST_HEAD(&page->lru); - free_reserved_page(page); - } -} - -#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE -#ifndef CONFIG_SPARSEMEM_VMEMMAP -static void register_page_bootmem_info_section(unsigned long start_pfn) -{ - unsigned long mapsize, section_nr, i; - struct mem_section *ms; - struct page *page, *memmap; - struct mem_section_usage *usage; - - section_nr = pfn_to_section_nr(start_pfn); - ms = __nr_to_section(section_nr); - - /* Get section's memmap address */ - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - - /* - * Get page for the memmap's phys address - * XXX: need more consideration for sparse_vmemmap... - */ - page = virt_to_page(memmap); - mapsize = sizeof(struct page) * PAGES_PER_SECTION; - mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; - - /* remember memmap's page */ - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, SECTION_INFO); - - usage = ms->usage; - page = virt_to_page(usage); - - mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; - - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_SECTION_INFO); - -} -#else /* CONFIG_SPARSEMEM_VMEMMAP */ -static void register_page_bootmem_info_section(unsigned long start_pfn) -{ - unsigned long mapsize, section_nr, i; - struct mem_section *ms; - struct page *page, *memmap; - struct mem_section_usage *usage; - - section_nr = pfn_to_section_nr(start_pfn); - ms = __nr_to_section(section_nr); - - memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - - register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - - usage = ms->usage; - page = virt_to_page(usage); - - mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; - - for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_SECTION_INFO); -} -#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ - -void __init register_page_bootmem_info_node(struct pglist_data *pgdat) -{ - unsigned long i, pfn, end_pfn, nr_pages; - int node = pgdat->node_id; - struct page *page; - - nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; - page = virt_to_page(pgdat); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - - pfn = pgdat->node_start_pfn; - end_pfn = pgdat_end_pfn(pgdat); - - /* register section info */ - for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - /* - * Some platforms can assign the same pfn to multiple nodes - on - * node0 as well as nodeN. To avoid registering a pfn against - * multiple nodes we check that this pfn does not already - * reside in some other nodes. - */ - if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) - register_page_bootmem_info_section(pfn); - } -} -#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ - static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, const char *reason) { diff --git a/mm/sparse.c b/mm/sparse.c index 7272f7a1449d..6326cdf36c4f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" #include -- cgit v1.2.3 From 6be24bed9da367c29b04e6fba8c9f27db39aa665 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:47:04 -0700 Subject: mm: hugetlb: introduce a new config HUGETLB_PAGE_FREE_VMEMMAP The option HUGETLB_PAGE_FREE_VMEMMAP allows for the freeing of some vmemmap pages associated with pre-allocated HugeTLB pages. For example, on X86_64 6 vmemmap pages of size 4KB each can be saved for each 2MB HugeTLB page. 4094 vmemmap pages of size 4KB each can be saved for each 1GB HugeTLB page. When a HugeTLB page is allocated or freed, the vmemmap array representing the range associated with the page will need to be remapped. When a page is allocated, vmemmap pages are freed after remapping. When a page is freed, previously discarded vmemmap pages must be allocated before remapping. The config option is introduced early so that supporting code can be written to depend on the option. The initial version of the code only provides support for x86-64. If config HAVE_BOOTMEM_INFO_NODE is enabled, the freeing vmemmap page code denpend on it to free vmemmap pages. Otherwise, just use free_reserved_page() to free vmemmmap pages. The routine register_page_bootmem_info() is used to register bootmem info. Therefore, make sure register_page_bootmem_info is enabled if HUGETLB_PAGE_FREE_VMEMMAP is defined. Link: https://lkml.kernel.org/r/20210510030027.56044-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador Acked-by: Mike Kravetz Reviewed-by: Miaohe Lin Tested-by: Chen Huang Tested-by: Bodeddula Balasubramaniam Reviewed-by: Balbir Singh Cc: Alexander Viro Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Barry Song Cc: Borislav Petkov Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: HORIGUCHI NAOYA Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joao Martins Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mina Almasry Cc: Oliver Neukum Cc: Paul E. McKenney Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 2 +- fs/Kconfig | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3aaf1d30c777..65ea58527176 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1270,7 +1270,7 @@ static struct kcore_list kcore_vsyscall; static void __init register_page_bootmem_info(void) { -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) int i; for_each_online_node(i) diff --git a/fs/Kconfig b/fs/Kconfig index 141a856c50e7..58a53455d1fe 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -240,6 +240,11 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config HUGETLB_PAGE_FREE_VMEMMAP + def_bool HUGETLB_PAGE + depends on X86_64 + depends on SPARSEMEM_VMEMMAP + config MEMFD_CREATE def_bool TMPFS || HUGETLBFS -- cgit v1.2.3 From e9fdff87e893ec5b7c32836675db80cf691b2a8b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:47:25 -0700 Subject: mm: hugetlb: add a kernel parameter hugetlb_free_vmemmap Add a kernel parameter hugetlb_free_vmemmap to enable the feature of freeing unused vmemmap pages associated with each hugetlb page on boot. We disable PMD mapping of vmemmap pages for x86-64 arch when this feature is enabled. Because vmemmap_remap_free() depends on vmemmap being base page mapped. Link: https://lkml.kernel.org/r/20210510030027.56044-8-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador Reviewed-by: Barry Song Reviewed-by: Miaohe Lin Tested-by: Chen Huang Tested-by: Bodeddula Balasubramaniam Reviewed-by: Mike Kravetz Cc: Alexander Viro Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Balbir Singh Cc: Borislav Petkov Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: HORIGUCHI NAOYA Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joao Martins Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mina Almasry Cc: Oliver Neukum Cc: Paul E. McKenney Cc: Pawan Gupta Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Thomas Gleixner Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++ Documentation/admin-guide/mm/hugetlbpage.rst | 3 +++ arch/x86/mm/init_64.c | 8 ++++++-- include/linux/hugetlb.h | 19 +++++++++++++++++++ mm/hugetlb_vmemmap.c | 24 ++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 566c4b9af3cd..b787a9953f2e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1567,6 +1567,23 @@ Documentation/admin-guide/mm/hugetlbpage.rst. Format: size[KMG] + hugetlb_free_vmemmap= + [KNL] Reguires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP + enabled. + Allows heavy hugetlb users to free up some more + memory (6 * PAGE_SIZE for each 2MB hugetlb page). + This feauture is not free though. Large page + tables are not used to back vmemmap pages which + can lead to a performance degradation for some + workloads. Also there will be memory allocation + required when hugetlb pages are freed from the + pool which can lead to corner cases under heavy + memory pressure. + Format: { on | off (default) } + + on: enable the feature + off: disable the feature + hung_task_panic= [KNL] Should the hung task detector generate panics. Format: 0 | 1 diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index 6988895d09a8..8abaeb144e44 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -153,6 +153,9 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. +hugetlb_free_vmemmap + When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing + unused vmemmap pages associated with each HugeTLB page. When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` indicates the current number of pre-allocated huge pages of the default size. diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 65ea58527176..9d9d18d0c2a1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1609,7 +1610,8 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE)); VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE)); - if (end - start < PAGES_PER_SECTION * sizeof(struct page)) + if ((is_hugetlb_free_vmemmap_enabled() && !altmap) || + end - start < PAGES_PER_SECTION * sizeof(struct page)) err = vmemmap_populate_basepages(start, end, node, NULL); else if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); @@ -1637,6 +1639,8 @@ void register_page_bootmem_memmap(unsigned long section_nr, pmd_t *pmd; unsigned int nr_pmd_pages; struct page *page; + bool base_mapping = !boot_cpu_has(X86_FEATURE_PSE) || + is_hugetlb_free_vmemmap_enabled(); for (; addr < end; addr = next) { pte_t *pte = NULL; @@ -1662,7 +1666,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); - if (!boot_cpu_has(X86_FEATURE_PSE)) { + if (base_mapping) { next = (addr + PAGE_SIZE) & PAGE_MASK; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3578d9d708fe..9ad99848f9f0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -892,6 +892,20 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +extern bool hugetlb_free_vmemmap_enabled; + +static inline bool is_hugetlb_free_vmemmap_enabled(void) +{ + return hugetlb_free_vmemmap_enabled; +} +#else +static inline bool is_hugetlb_free_vmemmap_enabled(void) +{ + return false; +} +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -1046,6 +1060,11 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr pte_t *ptep, pte_t pte, unsigned long sz) { } + +static inline bool is_hugetlb_free_vmemmap_enabled(void) +{ + return false; +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index a897c7778246..3070e1465b1b 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -168,6 +168,8 @@ * (last) level. So this type of HugeTLB page can be optimized only when its * size of the struct page structs is greater than 2 pages. */ +#define pr_fmt(fmt) "HugeTLB: " fmt + #include "hugetlb_vmemmap.h" /* @@ -180,6 +182,28 @@ #define RESERVE_VMEMMAP_NR 2U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) +bool hugetlb_free_vmemmap_enabled; + +static int __init early_hugetlb_free_vmemmap_param(char *buf) +{ + /* We cannot optimize if a "struct page" crosses page boundaries. */ + if ((!is_power_of_2(sizeof(struct page)))) { + pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n"); + return 0; + } + + if (!buf) + return -EINVAL; + + if (!strcmp(buf, "on")) + hugetlb_free_vmemmap_enabled = true; + else if (strcmp(buf, "off")) + return -EINVAL; + + return 0; +} +early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param); + static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h) { return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT; -- cgit v1.2.3 From c742199a014de23ee92055c2473d91fe5561ffdf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 30 Jun 2021 18:48:03 -0700 Subject: mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge For architectures with no PMD and/or no PUD, add stubs similar to what we have for architectures without P4D. [christophe.leroy@csgroup.eu: arm64: define only {pud/pmd}_{set/clear}_huge when useful] Link: https://lkml.kernel.org/r/73ec95f40cafbbb69bdfb43a7f53876fd845b0ce.1620990479.git.christophe.leroy@csgroup.eu [christophe.leroy@csgroup.eu: x86: define only {pud/pmd}_{set/clear}_huge when useful] Link: https://lkml.kernel.org/r/7fbf1b6bc3e15c07c24fa45278d57064f14c896b.1620930415.git.christophe.leroy@csgroup.eu Link: https://lkml.kernel.org/r/5ac5976419350e8e048d463a64cae449eb3ba4b0.1620795204.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Uladzislau Rezki Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 20 ++++++++++++-------- arch/x86/mm/pgtable.c | 34 +++++++++++++++++++--------------- include/linux/pgtable.h | 26 +++++++++++++++++++++++++- 3 files changed, 56 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 89b66ef43a0f..add67deb4910 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1338,6 +1338,7 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) return dt_virt; } +#if CONFIG_PGTABLE_LEVELS > 3 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) { pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot)); @@ -1352,6 +1353,16 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) return 1; } +int pud_clear_huge(pud_t *pudp) +{ + if (!pud_sect(READ_ONCE(*pudp))) + return 0; + pud_clear(pudp); + return 1; +} +#endif + +#if CONFIG_PGTABLE_LEVELS > 2 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) { pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot)); @@ -1366,14 +1377,6 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) return 1; } -int pud_clear_huge(pud_t *pudp) -{ - if (!pud_sect(READ_ONCE(*pudp))) - return 0; - pud_clear(pudp); - return 1; -} - int pmd_clear_huge(pmd_t *pmdp) { if (!pmd_sect(READ_ONCE(*pmdp))) @@ -1381,6 +1384,7 @@ int pmd_clear_huge(pmd_t *pmdp) pmd_clear(pmdp); return 1; } +#endif int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d27cf69e811d..1303ff6ef7be 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -682,6 +682,7 @@ int p4d_clear_huge(p4d_t *p4d) } #endif +#if CONFIG_PGTABLE_LEVELS > 3 /** * pud_set_huge - setup kernel PUD mapping * @@ -720,6 +721,23 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ +int pud_clear_huge(pud_t *pud) +{ + if (pud_large(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} +#endif + +#if CONFIG_PGTABLE_LEVELS > 2 /** * pmd_set_huge - setup kernel PMD mapping * @@ -750,21 +768,6 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } -/** - * pud_clear_huge - clear kernel PUD mapping when it is set - * - * Returns 1 on success and 0 on failure (no PUD map is found). - */ -int pud_clear_huge(pud_t *pud) -{ - if (pud_large(*pud)) { - pud_clear(pud); - return 1; - } - - return 0; -} - /** * pmd_clear_huge - clear kernel PMD mapping when it is set * @@ -779,6 +782,7 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } +#endif #ifdef CONFIG_X86_64 /** diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c32600c9e1ad..2b0d02291178 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1379,10 +1379,34 @@ static inline int p4d_clear_huge(p4d_t *p4d) } #endif /* !__PAGETABLE_P4D_FOLDED */ +#ifndef __PAGETABLE_PUD_FOLDED int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); -int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); +#else +static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} +static inline int pud_clear_huge(pud_t *pud) +{ + return 0; +} +#endif /* !__PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED +int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pmd_clear_huge(pmd_t *pmd); +#else +static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} +static inline int pmd_clear_huge(pmd_t *pmd) +{ + return 0; +} +#endif /* !__PAGETABLE_PMD_FOLDED */ + int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); -- cgit v1.2.3 From 2d7a21715f25122779e2bed17db8c57aa01e922f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 30 Jun 2021 18:48:25 -0700 Subject: mm: sparsemem: use huge PMD mapping for vmemmap pages The preparation of splitting huge PMD mapping of vmemmap pages is ready, so switch the mapping from PTE to PMD. Link: https://lkml.kernel.org/r/20210616094915.34432-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Chen Huang Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Michal Hocko Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 7 ------- arch/x86/mm/init_64.c | 8 ++------ include/linux/hugetlb.h | 25 ++++++------------------- mm/memory_hotplug.c | 2 +- 4 files changed, 9 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 597572ceb8ba..7a7da02b3bc6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1572,13 +1572,6 @@ enabled. Allows heavy hugetlb users to free up some more memory (6 * PAGE_SIZE for each 2MB hugetlb page). - This feauture is not free though. Large page - tables are not used to back vmemmap pages which - can lead to a performance degradation for some - workloads. Also there will be memory allocation - required when hugetlb pages are freed from the - pool which can lead to corner cases under heavy - memory pressure. Format: { on | off (default) } on: enable the feature diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d9d18d0c2a1..65ea58527176 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -1610,8 +1609,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE)); VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE)); - if ((is_hugetlb_free_vmemmap_enabled() && !altmap) || - end - start < PAGES_PER_SECTION * sizeof(struct page)) + if (end - start < PAGES_PER_SECTION * sizeof(struct page)) err = vmemmap_populate_basepages(start, end, node, NULL); else if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); @@ -1639,8 +1637,6 @@ void register_page_bootmem_memmap(unsigned long section_nr, pmd_t *pmd; unsigned int nr_pmd_pages; struct page *page; - bool base_mapping = !boot_cpu_has(X86_FEATURE_PSE) || - is_hugetlb_free_vmemmap_enabled(); for (; addr < end; addr = next) { pte_t *pte = NULL; @@ -1666,7 +1662,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); - if (base_mapping) { + if (!boot_cpu_has(X86_FEATURE_PSE)) { next = (addr + PAGE_SIZE) & PAGE_MASK; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cfde3bec2261..f11ba701e199 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -895,20 +895,6 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif -#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP -extern bool hugetlb_free_vmemmap_enabled; - -static inline bool is_hugetlb_free_vmemmap_enabled(void) -{ - return hugetlb_free_vmemmap_enabled; -} -#else -static inline bool is_hugetlb_free_vmemmap_enabled(void) -{ - return false; -} -#endif - #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -1063,13 +1049,14 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr pte_t *ptep, pte_t pte, unsigned long sz) { } - -static inline bool is_hugetlb_free_vmemmap_enabled(void) -{ - return false; -} #endif /* CONFIG_HUGETLB_PAGE */ +#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP +extern bool hugetlb_free_vmemmap_enabled; +#else +#define hugetlb_free_vmemmap_enabled false +#endif + static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 29d1d742b565..492560b61999 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1056,7 +1056,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size) * populate a single PMD. */ return memmap_on_memory && - !is_hugetlb_free_vmemmap_enabled() && + !hugetlb_free_vmemmap_enabled && IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && size == memory_block_size_bytes() && IS_ALIGNED(vmemmap_size, PMD_SIZE) && -- cgit v1.2.3 From cebc774fdc9cb39b959968fbfd7aabe7a8a5154c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 30 Jun 2021 18:51:58 -0700 Subject: mm/thp: make ARCH_ENABLE_SPLIT_PMD_PTLOCK dependent on PGTABLE_LEVELS > 2 ARCH_ENABLE_SPLIT_PMD_PTLOCK is irrelevant unless there are more than two page table levels including PMD (also per Documentation/vm/split_page_table_lock.rst). Make this dependency explicit on remaining platforms i.e x86 and s390 where ARCH_ENABLE_SPLIT_PMD_PTLOCK is subscribed. Link: https://lkml.kernel.org/r/1622013501-20409-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Gerald Schaefer # s390 Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 707afbcd81c2..4098637a52fc 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -62,7 +62,7 @@ config S390 select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM select ARCH_ENABLE_MEMORY_HOTREMOVE - select ARCH_ENABLE_SPLIT_PMD_PTLOCK + select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5d523ff70fe7..8c5d86655180 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -63,7 +63,7 @@ config X86 select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG - select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE + select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE) select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE -- cgit v1.2.3 From 63703f37aa09e2c12c0ff25afbf5c460b21bfe4c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 30 Jun 2021 18:52:20 -0700 Subject: mm: generalize ZONE_[DMA|DMA32] ZONE_[DMA|DMA32] configs have duplicate definitions on platforms that subscribe to them. Instead, just make them generic options which can be selected on applicable platforms. Also only x86/arm64 architectures could enable both ZONE_DMA and ZONE_DMA32 if EXPERT, add ARCH_HAS_ZONE_DMA_SET to make dma zone configurable and visible on the two architectures. Link: https://lkml.kernel.org/r/20210528074557.17768-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Catalin Marinas [arm64] Acked-by: Geert Uytterhoeven [m68k] Acked-by: Mike Rapoport Acked-by: Palmer Dabbelt [RISC-V] Acked-by: Michal Simek [microblaze] Acked-by: Michael Ellerman [powerpc] Cc: Catalin Marinas Cc: Will Deacon Cc: Geert Uytterhoeven Cc: Thomas Bogendoerfer Cc: "David S. Miller" Cc: Ingo Molnar Cc: Borislav Petkov Cc: Richard Henderson Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 5 +---- arch/arm/Kconfig | 3 --- arch/arm64/Kconfig | 9 +-------- arch/ia64/Kconfig | 4 +--- arch/m68k/Kconfig | 5 +---- arch/microblaze/Kconfig | 4 +--- arch/mips/Kconfig | 7 ------- arch/powerpc/Kconfig | 4 ---- arch/powerpc/platforms/Kconfig.cputype | 1 + arch/riscv/Kconfig | 5 +---- arch/s390/Kconfig | 4 +--- arch/sparc/Kconfig | 5 +---- arch/x86/Kconfig | 15 ++------------- mm/Kconfig | 12 ++++++++++++ 14 files changed, 23 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 8954216b9956..77d3280dc678 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -40,6 +40,7 @@ config ALPHA select MMU_GATHER_NO_RANGE select SET_FS select SPARSEMEM_EXTREME if SPARSEMEM + select ZONE_DMA help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, @@ -65,10 +66,6 @@ config GENERIC_CALIBRATE_DELAY bool default y -config ZONE_DMA - bool - default y - config GENERIC_ISA_DMA bool default y diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 24804f11302d..000c3f80b58e 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -218,9 +218,6 @@ config GENERIC_CALIBRATE_DELAY config ARCH_MAY_HAVE_PC_FDC bool -config ZONE_DMA - bool - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 84f0216d3c5c..39b378ee56ce 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -42,6 +42,7 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK if !PREEMPTION @@ -306,14 +307,6 @@ config GENERIC_CSUM config GENERIC_CALIBRATE_DELAY def_bool y -config ZONE_DMA - bool "Support DMA zone" if EXPERT - default y - -config ZONE_DMA32 - bool "Support DMA32 zone" if EXPERT - default y - config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index f4ff6bd5fc13..cf425c2c63af 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -60,6 +60,7 @@ config IA64 select NUMA if !FLATMEM select PCI_MSI_ARCH_FALLBACKS if PCI_MSI select SET_FS + select ZONE_DMA32 default y help The Itanium Processor Family is Intel's 64-bit successor to @@ -72,9 +73,6 @@ config 64BIT select ATA_NONSTANDARD if ATA default y -config ZONE_DMA32 - def_bool y - config MMU bool default y diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 372e4e69c43a..05a729c6ad7f 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -34,6 +34,7 @@ config M68K select SET_FS select UACCESS_MEMCPY if !MMU select VIRT_TO_BUS + select ZONE_DMA config CPU_BIG_ENDIAN def_bool y @@ -62,10 +63,6 @@ config TIME_LOW_RES config NO_IOPORT_MAP def_bool y -config ZONE_DMA - bool - default y - config HZ int default 1000 if CLEOPATRA diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 0660f47012bc..14a67a42fcae 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -43,6 +43,7 @@ config MICROBLAZE select MMU_GATHER_NO_RANGE select SPARSE_IRQ select SET_FS + select ZONE_DMA # Endianness selection choice @@ -60,9 +61,6 @@ config CPU_LITTLE_ENDIAN endchoice -config ZONE_DMA - def_bool y - config ARCH_HAS_ILOG2_U32 def_bool n diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index df02e36a4a8d..d69f4b17d0a6 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -3274,13 +3274,6 @@ config I8253 select CLKSRC_I8253 select CLKEVT_I8253 select MIPS_EXTERNAL_TIMER - -config ZONE_DMA - bool - -config ZONE_DMA32 - bool - endmenu config TRAD_SIGNALS diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 98206f3ebae0..df46324d5090 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -403,10 +403,6 @@ config PPC_ADV_DEBUG_DAC_RANGE config PPC_DAWR bool -config ZONE_DMA - bool - default y if PPC_BOOK3E_64 - config PGTABLE_LEVELS int default 2 if !PPC64 diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index f998e655b570..7d271de8fcbd 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -111,6 +111,7 @@ config PPC_BOOK3E_64 select PPC_FPU # Make it a choice ? select PPC_SMP_MUXED_IPI select PPC_DOORBELL + select ZONE_DMA endchoice diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 15f9490a7aad..469a70bd8da6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -104,6 +104,7 @@ config RISCV select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select UACCESS_MEMCPY if !MMU + select ZONE_DMA32 if 64BIT config ARCH_MMAP_RND_BITS_MIN default 18 if 64BIT @@ -133,10 +134,6 @@ config MMU Select if you want MMU-based virtualised addressing space support by paged memory management. If unsure, say 'Y'. -config ZONE_DMA32 - bool - default y if 64BIT - config VA_BITS int default 32 if 32BIT diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4098637a52fc..9e14acacb884 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -2,9 +2,6 @@ config MMU def_bool y -config ZONE_DMA - def_bool y - config CPU_BIG_ENDIAN def_bool y @@ -210,6 +207,7 @@ config S390 select THREAD_INFO_IN_TASK select TTY select VIRT_CPU_ACCOUNTING + select ZONE_DMA # Note: keep the above list sorted alphabetically config SCHED_OMIT_FRAME_POINTER diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c72f52c704cd..c5fa7932b550 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -59,6 +59,7 @@ config SPARC32 select CLZ_TAB select HAVE_UID16 select OLD_SIGACTION + select ZONE_DMA config SPARC64 def_bool 64BIT @@ -141,10 +142,6 @@ config HIGHMEM default y if SPARC32 select KMAP_LOCAL -config ZONE_DMA - bool - default y if SPARC32 - config GENERIC_ISA_DMA bool default y if SPARC32 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8c5d86655180..5f875112f086 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,6 +33,7 @@ config X86_64 select NEED_DMA_MAP_STATE select SWIOTLB select ARCH_HAS_ELFCORE_COMPAT + select ZONE_DMA32 config FORCE_DYNAMIC_FTRACE def_bool y @@ -93,6 +94,7 @@ config X86 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_DEBUG_WX + select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT @@ -343,9 +345,6 @@ config ARCH_SUSPEND_POSSIBLE config ARCH_WANT_GENERAL_HUGETLB def_bool y -config ZONE_DMA32 - def_bool y if X86_64 - config AUDIT_ARCH def_bool y if X86_64 @@ -393,16 +392,6 @@ config CC_HAS_SANE_STACKPROTECTOR menu "Processor type and features" -config ZONE_DMA - bool "DMA memory allocation support" if EXPERT - default y - help - DMA memory allocation support allows devices with less than 32-bit - addressing to allocate within the first 16MB of address space. - Disable if no such devices will be used. - - If unsure, say Y. - config SMP bool "Symmetric multi-processing support" help diff --git a/mm/Kconfig b/mm/Kconfig index a0b2c37f851b..a02498c0e13d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -761,6 +761,18 @@ config ARCH_HAS_CACHE_LINE_SIZE config ARCH_HAS_PTE_DEVMAP bool +config ARCH_HAS_ZONE_DMA_SET + bool + +config ZONE_DMA + bool "Support DMA zone" if ARCH_HAS_ZONE_DMA_SET + default y if ARM64 || X86 + +config ZONE_DMA32 + bool "Support DMA32 zone" if ARCH_HAS_ZONE_DMA_SET + depends on !X86_32 + default y if ARM64 + config ZONE_DEVICE bool "Device memory (pmem, HMM, etc...) hotplug support" depends on MEMORY_HOTPLUG -- cgit v1.2.3 From fac7757e1fb05b75c8e22d4f8fe2f6c9c4d7edca Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 30 Jun 2021 18:53:13 -0700 Subject: mm: define default value for FIRST_USER_ADDRESS Currently most platforms define FIRST_USER_ADDRESS as 0UL duplication the same code all over. Instead just define a generic default value (i.e 0UL) for FIRST_USER_ADDRESS and let the platforms override when required. This makes it much cleaner with reduced code. The default FIRST_USER_ADDRESS here would be skipped in when the given platform overrides its value via . Link: https://lkml.kernel.org/r/1620615725-24623-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Geert Uytterhoeven [m68k] Acked-by: Guo Ren [csky] Acked-by: Stafford Horne [openrisc] Acked-by: Catalin Marinas [arm64] Acked-by: Mike Rapoport Acked-by: Palmer Dabbelt [RISC-V] Cc: Richard Henderson Cc: Vineet Gupta Cc: Catalin Marinas Cc: Will Deacon Cc: Guo Ren Cc: Brian Cain Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Thomas Bogendoerfer Cc: Ley Foon Tan Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Christophe Leroy Cc: Paul Walmsley Cc: Heiko Carstens Cc: Yoshinori Sato Cc: "David S. Miller" Cc: Jeff Dike Cc: Thomas Gleixner Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 1 - arch/arc/include/asm/pgtable.h | 6 ------ arch/arm64/include/asm/pgtable.h | 2 -- arch/csky/include/asm/pgtable.h | 1 - arch/hexagon/include/asm/pgtable.h | 3 --- arch/ia64/include/asm/pgtable.h | 1 - arch/m68k/include/asm/pgtable_mm.h | 1 - arch/microblaze/include/asm/pgtable.h | 2 -- arch/mips/include/asm/pgtable-32.h | 1 - arch/mips/include/asm/pgtable-64.h | 1 - arch/nios2/include/asm/pgtable.h | 2 -- arch/openrisc/include/asm/pgtable.h | 1 - arch/parisc/include/asm/pgtable.h | 2 -- arch/powerpc/include/asm/book3s/pgtable.h | 1 - arch/powerpc/include/asm/nohash/32/pgtable.h | 1 - arch/powerpc/include/asm/nohash/64/pgtable.h | 2 -- arch/riscv/include/asm/pgtable.h | 2 -- arch/s390/include/asm/pgtable.h | 2 -- arch/sh/include/asm/pgtable.h | 2 -- arch/sparc/include/asm/pgtable_32.h | 1 - arch/sparc/include/asm/pgtable_64.h | 3 --- arch/um/include/asm/pgtable-2level.h | 1 - arch/um/include/asm/pgtable-3level.h | 1 - arch/x86/include/asm/pgtable_types.h | 2 -- arch/xtensa/include/asm/pgtable.h | 1 - include/linux/pgtable.h | 9 +++++++++ 26 files changed, 9 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index e1757b7cfe3d..ff690846465e 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -46,7 +46,6 @@ struct vm_area_struct; #define PTRS_PER_PMD (1UL << (PAGE_SHIFT-3)) #define PTRS_PER_PGD (1UL << (PAGE_SHIFT-3)) #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0UL /* Number of pointers that fit on a page: this will go away. */ #define PTRS_PER_PAGE (1UL << (PAGE_SHIFT-3)) diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 5878846f00cf..577682e8cc7f 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -222,12 +222,6 @@ */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -/* - * No special requirements for lowest virtual address we permit any user space - * mapping to be mapped at. - */ -#define FIRST_USER_ADDRESS 0UL - /**************************************************************** * Bucket load of VM Helpers diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0b10204e72fc..25f5c04b43ce 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -26,8 +26,6 @@ #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) -#define FIRST_USER_ADDRESS 0UL - #ifndef __ASSEMBLY__ #include diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 0d60367b6bfa..151607ed5158 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -14,7 +14,6 @@ #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (PAGE_OFFSET/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0UL /* * C-SKY is two-level paging structure: diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index dbb22b80b8c4..e4979508cddf 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -155,9 +155,6 @@ extern unsigned long _dflt_cache_att; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */ -/* Seems to be zero even in architectures where the zero page is firewalled? */ -#define FIRST_USER_ADDRESS 0UL - /* HUGETLB not working currently */ #ifdef CONFIG_HUGETLB_PAGE #define pte_mkhuge(pte) __pte((pte_val(pte) & ~0x3) | HVM_HUGEPAGE_SIZE) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index d765fd948fae..3f5dbbd8b9d8 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -128,7 +128,6 @@ #define PTRS_PER_PGD_SHIFT PTRS_PER_PTD_SHIFT #define PTRS_PER_PGD (1UL << PTRS_PER_PGD_SHIFT) #define USER_PTRS_PER_PGD (5*PTRS_PER_PGD/8) /* regions 0-4 are user regions */ -#define FIRST_USER_ADDRESS 0UL /* * All the normal masks have the "page accessed" bits on, as any time diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index aca22c2c1ee2..143ba7de9bda 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -72,7 +72,6 @@ #define PTRS_PER_PGD 128 #endif #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0UL /* Virtual address region for use by kernel_map() */ #ifdef CONFIG_SUN3 diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 9ae8d2c17dd5..71cd547655d9 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -25,8 +25,6 @@ extern int mem_init_done; #include #include -#define FIRST_USER_ADDRESS 0UL - extern unsigned long va_to_phys(unsigned long address); extern pte_t *va_to_pte(unsigned long address); diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index 6c0532d7b211..95df9c293d8d 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -93,7 +93,6 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, #endif #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0UL #define VMALLOC_START MAP_BASE diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 1e7d6ce9d8d6..046465906c82 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -137,7 +137,6 @@ #define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) #define USER_PTRS_PER_PGD ((TASK_SIZE64 / PGDIR_SIZE)?(TASK_SIZE64 / PGDIR_SIZE):1) -#define FIRST_USER_ADDRESS 0UL /* * TLB refill handlers also map the vmalloc area into xuseg. Avoid diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 2600d76c310c..4a995fa628ee 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -24,8 +24,6 @@ #include #include -#define FIRST_USER_ADDRESS 0UL - #define VMALLOC_START CONFIG_NIOS2_KERNEL_MMU_REGION_BASE #define VMALLOC_END (CONFIG_NIOS2_KERNEL_REGION_BASE - 1) diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 9425bedab4fc..4ac591c9ca33 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -73,7 +73,6 @@ extern void paging_init(void); */ #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0UL /* * Kernels own virtual memory area. diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 39017210dbf0..7f33c29764cc 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -171,8 +171,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) * pgd entries used up by user/kernel: */ -#define FIRST_USER_ADDRESS 0UL - /* NB: The tlb miss handlers make certain assumptions about the order */ /* of the following bits, so be careful (One example, bits 25-31 */ /* are moved together in one instruction). */ diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index 0e1263455d73..ad130e15a126 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -8,7 +8,6 @@ #include #endif -#define FIRST_USER_ADDRESS 0UL #ifndef __ASSEMBLY__ /* Insert a PTE, top-level function is out of line. It uses an inline * low level function in the respective pgtable-* files diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 96522f7f0618..f06ae00f2a65 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -54,7 +54,6 @@ extern int icache_44x_need_flush; #define PGD_MASKED_BITS 0 #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 57cd3892bfe0..53fbfdfac93d 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -12,8 +12,6 @@ #include #include -#define FIRST_USER_ADDRESS 0UL - /* * Size of EA range mapped by our pagetables. */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 380cd3a7e548..62f3fe7368f3 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -536,8 +536,6 @@ void setup_bootmem(void); void paging_init(void); void misc_mem_init(void); -#define FIRST_USER_ADDRESS 0 - /* * ZERO_PAGE is a global shared page that is always zero, * used for zero-mapped memory areas, etc. diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b38f7b781564..7c86bf03fc8f 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -65,8 +65,6 @@ extern unsigned long zero_page_mask; /* TODO: s390 cannot support io_remap_pfn_range... */ -#define FIRST_USER_ADDRESS 0UL - #define pte_ERROR(e) \ printk("%s:%d: bad pte %p.\n", __FILE__, __LINE__, (void *) pte_val(e)) #define pmd_ERROR(e) \ diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 27751e9470df..d7ddb1ec86a0 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -59,8 +59,6 @@ static inline unsigned long long neff_sign_extend(unsigned long val) /* Entries per level */ #define PTRS_PER_PTE (PAGE_SIZE / (1 << PTE_MAGNITUDE)) -#define FIRST_USER_ADDRESS 0UL - #define PHYS_ADDR_MASK29 0x1fffffff #define PHYS_ADDR_MASK32 0xffffffff diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index a5cf79c149fe..0888bda245f5 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -48,7 +48,6 @@ unsigned long __init bootmem_init(unsigned long *pages_avail); #define PTRS_PER_PMD 64 #define PTRS_PER_PGD 256 #define USER_PTRS_PER_PGD PAGE_OFFSET / PGDIR_SIZE -#define FIRST_USER_ADDRESS 0UL #define PTE_SIZE (PTRS_PER_PTE*4) #define PAGE_NONE SRMMU_PAGE_NONE diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 2cd80a0a9795..a400e0f23046 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -95,9 +95,6 @@ bool kern_addr_valid(unsigned long addr); #define PTRS_PER_PUD (1UL << PUD_BITS) #define PTRS_PER_PGD (1UL << PGDIR_BITS) -/* Kernel has a separate 44bit address space. */ -#define FIRST_USER_ADDRESS 0UL - #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %p(%016lx) seen at (%pS)\n", \ __FILE__, __LINE__, &(e), pmd_val(e), __builtin_return_address(0)) diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h index 32106d31e4ab..8256ecc5b919 100644 --- a/arch/um/include/asm/pgtable-2level.h +++ b/arch/um/include/asm/pgtable-2level.h @@ -23,7 +23,6 @@ #define PTRS_PER_PTE 1024 #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) #define PTRS_PER_PGD 1024 -#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%08lx).\n", __FILE__, __LINE__, &(e), \ diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 7e6a4180db9d..9289a86643a9 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h @@ -41,7 +41,6 @@ #endif #define USER_PTRS_PER_PGD ((TASK_SIZE + (PGDIR_SIZE - 1)) / PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), \ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f24d7ef8fffa..40497a9020c6 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -7,8 +7,6 @@ #include -#define FIRST_USER_ADDRESS 0UL - #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ #define _PAGE_BIT_USER 2 /* userspace addressable */ diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index d7fc45c920c2..bd5aeb795567 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -59,7 +59,6 @@ #define PTRS_PER_PGD 1024 #define PGD_ORDER 0 #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_ADDRESS 0UL #define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT) #ifdef CONFIG_MMU diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2b0d02291178..69700e3e615f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -28,6 +28,15 @@ #define USER_PGTABLES_CEILING 0UL #endif +/* + * This defines the first usable user address. Platforms + * can override its value with custom FIRST_USER_ADDRESS + * defined in their respective . + */ +#ifndef FIRST_USER_ADDRESS +#define FIRST_USER_ADDRESS 0UL +#endif + /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * -- cgit v1.2.3 From 1c2f7d14d84f767a797558609eb034511e02f41e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 30 Jun 2021 18:53:59 -0700 Subject: mm/thp: define default pmd_pgtable() Currently most platforms define pmd_pgtable() as pmd_page() duplicating the same code all over. Instead just define a default value i.e pmd_page() for pmd_pgtable() and let platforms override when required via . All the existing platform that override pmd_pgtable() have been moved into their respective header in order to precede before the new generic definition. This makes it much cleaner with reduced code. Link: https://lkml.kernel.org/r/1623646133-20306-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Geert Uytterhoeven Acked-by: Mike Rapoport Cc: Nick Hu Cc: Richard Henderson Cc: Vineet Gupta Cc: Catalin Marinas Cc: Will Deacon Cc: Guo Ren Cc: Brian Cain Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Thomas Bogendoerfer Cc: Ley Foon Tan Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Christophe Leroy Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Heiko Carstens Cc: Yoshinori Sato Cc: "David S. Miller" Cc: Jeff Dike Cc: Thomas Gleixner Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgalloc.h | 1 - arch/arc/include/asm/pgalloc.h | 2 -- arch/arc/include/asm/pgtable.h | 2 ++ arch/arm/include/asm/pgalloc.h | 1 - arch/arm64/include/asm/pgalloc.h | 1 - arch/csky/include/asm/pgalloc.h | 2 -- arch/hexagon/include/asm/pgtable.h | 1 - arch/ia64/include/asm/pgalloc.h | 1 - arch/m68k/include/asm/mcf_pgalloc.h | 2 -- arch/m68k/include/asm/mcf_pgtable.h | 2 ++ arch/m68k/include/asm/motorola_pgalloc.h | 1 - arch/m68k/include/asm/motorola_pgtable.h | 2 ++ arch/m68k/include/asm/sun3_pgalloc.h | 1 - arch/microblaze/include/asm/pgalloc.h | 2 -- arch/mips/include/asm/pgalloc.h | 1 - arch/nds32/include/asm/pgalloc.h | 5 ----- arch/nios2/include/asm/pgalloc.h | 1 - arch/openrisc/include/asm/pgalloc.h | 2 -- arch/parisc/include/asm/pgalloc.h | 1 - arch/powerpc/include/asm/pgalloc.h | 5 ----- arch/powerpc/include/asm/pgtable.h | 6 ++++++ arch/riscv/include/asm/pgalloc.h | 2 -- arch/s390/include/asm/pgalloc.h | 3 --- arch/s390/include/asm/pgtable.h | 3 +++ arch/sh/include/asm/pgalloc.h | 1 - arch/sparc/include/asm/pgalloc_32.h | 1 - arch/sparc/include/asm/pgalloc_64.h | 1 - arch/sparc/include/asm/pgtable_32.h | 2 ++ arch/sparc/include/asm/pgtable_64.h | 2 ++ arch/um/include/asm/pgalloc.h | 1 - arch/x86/include/asm/pgalloc.h | 2 -- arch/xtensa/include/asm/pgalloc.h | 2 -- include/linux/pgtable.h | 9 +++++++++ 33 files changed, 28 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index 9c6a24fe493d..68be7adbfe58 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -18,7 +18,6 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte) { pmd_set(pmd, (pte_t *)(page_to_pa(pte) + PAGE_OFFSET)); } -#define pmd_pgtable(pmd) pmd_page(pmd) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h index 6147db925248..a32ca3104ced 100644 --- a/arch/arc/include/asm/pgalloc.h +++ b/arch/arc/include/asm/pgalloc.h @@ -129,6 +129,4 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptep) #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) -#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd)) - #endif /* _ASM_ARC_PGALLOC_H */ diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 577682e8cc7f..320cc0ae8a08 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -350,6 +350,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #define kern_addr_valid(addr) (1) +#define pmd_pgtable(pmd) ((pgtable_t) pmd_page_vaddr(pmd)) + /* * remap a physical page `pfn' of size `size' with page protection `prot' * into virtual address `from' diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index fdee1f04f4f3..a17f01235c29 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -143,7 +143,6 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) __pmd_populate(pmdp, page_to_phys(ptep), prot); } -#define pmd_pgtable(pmd) pmd_page(pmd) #endif /* CONFIG_MMU */ diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 31fbab3d6f99..8433a2058eb1 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -86,6 +86,5 @@ pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) VM_BUG_ON(mm == &init_mm); __pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN); } -#define pmd_pgtable(pmd) pmd_page(pmd) #endif diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index cd211aabbefd..bbbd0698b397 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -22,8 +22,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, set_pmd(pmd, __pmd(__pa(page_address(pte)))); } -#define pmd_pgtable(pmd) pmd_page(pmd) - extern void pgd_init(unsigned long *p); static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index e4979508cddf..18cd6ea9ab23 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -239,7 +239,6 @@ static inline int pmd_bad(pmd_t pmd) * pmd_page - converts a PMD entry to a page pointer */ #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) -#define pmd_pgtable(pmd) pmd_page(pmd) /** * pte_none - check if pte is mapped diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index 9601cfe83c94..0fb2b6291d58 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -52,7 +52,6 @@ pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, pgtable_t pte) { pmd_val(*pmd_entry) = page_to_phys(pte); } -#define pmd_pgtable(pmd) pmd_page(pmd) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte) diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index bc1228e00518..5c2c0a864524 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -32,8 +32,6 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) #define pmd_populate_kernel pmd_populate -#define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT) - static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable, unsigned long address) { diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index 8d4ec05996c5..6f2b87d7a50d 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -150,6 +150,8 @@ #ifndef __ASSEMBLY__ +#define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT) + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index b4fc3b4f6bb3..74a817d9387f 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -88,7 +88,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page { pmd_set(pmd, page); } -#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 8076467eff4b..a2908164ee6f 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -105,6 +105,8 @@ extern unsigned long mm_cachebits; #define __S110 PAGE_SHARED_C #define __S111 PAGE_SHARED_C +#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) + /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 000f64869b91..198036aff519 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -32,7 +32,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page { pmd_val(*pmd) = __pa((unsigned long)page_address(page)); } -#define pmd_pgtable(pmd) pmd_page(pmd) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index d56b9f670ad1..6c33b05f730f 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -28,8 +28,6 @@ static inline pgd_t *get_pgd(void) #define pgd_alloc(mm) get_pgd() -#define pmd_pgtable(pmd) pmd_page(pmd) - extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte)) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 8b18424b3120..dd53d0f79cb3 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -28,7 +28,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, { set_pmd(pmd, __pmd((unsigned long)page_address(pte))); } -#define pmd_pgtable(pmd) pmd_page(pmd) /* * Initialize a new pmd table with invalid pointers. diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index 85c117347c86..a08e1ebca70e 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h @@ -12,11 +12,6 @@ #define __HAVE_ARCH_PTE_ALLOC_ONE #include /* for pte_{alloc,free}_one */ -/* - * Since we have only two-level page tables, these are trivial - */ -#define pmd_pgtable(pmd) pmd_page(pmd) - extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index e6600d2a5ae0..3c4ae74d5798 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -25,7 +25,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, { set_pmd(pmd, __pmd((unsigned long)page_address(pte))); } -#define pmd_pgtable(pmd) pmd_page(pmd) /* * Initialize a new pmd table with invalid pointers. diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 88820299ecc4..b7b2b8d16fad 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -72,6 +72,4 @@ do { \ tlb_remove_page((tlb), (pte)); \ } while (0) -#define pmd_pgtable(pmd) pmd_page(pmd) - #endif diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index dda557085311..6a7e98e71f1d 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -69,6 +69,5 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) #define pmd_populate(mm, pmd, pte_page) \ pmd_populate_kernel(mm, pmd, page_address(pte_page)) -#define pmd_pgtable(pmd) pmd_page(pmd) #endif diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 6dd78a2dc03a..3360cad78ace 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -70,9 +70,4 @@ extern struct kmem_cache *pgtable_cache[]; #include #endif -static inline pgtable_t pmd_pgtable(pmd_t pmd) -{ - return (pgtable_t)pmd_page_vaddr(pmd); -} - #endif /* _ASM_POWERPC_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index c6a676714f04..5969743719bc 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -152,6 +152,12 @@ static inline bool p4d_is_leaf(p4d_t p4d) } #endif +#define pmd_pgtable pmd_pgtable +static inline pgtable_t pmd_pgtable(pmd_t pmd) +{ + return (pgtable_t)pmd_page_vaddr(pmd); +} + #ifdef CONFIG_PPC64 #define is_ioremap_addr is_ioremap_addr static inline bool is_ioremap_addr(const void *x) diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 23b1544e0ca5..0af6933a7100 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -38,8 +38,6 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) } #endif /* __PAGETABLE_PMD_FOLDED */ -#define pmd_pgtable(pmd) pmd_page(pmd) - static inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 6b187cd72251..f14a555eff74 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -134,9 +134,6 @@ static inline void pmd_populate(struct mm_struct *mm, #define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte) -#define pmd_pgtable(pmd) \ - ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) - /* * page table entry allocation/free routines. */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 7c86bf03fc8f..1f8f5da53262 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1709,4 +1709,7 @@ extern void s390_reset_cmma(struct mm_struct *mm); #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN +#define pmd_pgtable(pmd) \ + ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) + #endif /* _S390_PAGE_H */ diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 0e6b0be25e33..a9e98233c4d4 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -30,7 +30,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, { set_pmd(pmd, __pmd((unsigned long)page_address(pte))); } -#define pmd_pgtable(pmd) pmd_page(pmd) #define __pte_free_tlb(tlb,pte,addr) \ do { \ diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 9d353e6dc5a9..4f73e87b22a3 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -51,7 +51,6 @@ static inline void free_pmd_fast(pmd_t * pmd) #define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) #define pmd_populate(mm, pmd, pte) pmd_set(pmd, pte) -#define pmd_pgtable(pmd) (pgtable_t)__pmd_page(pmd) void pmd_set(pmd_t *pmdp, pte_t *ptep); #define pmd_populate_kernel pmd_populate diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index a8dafc550985..7b5561d17ab1 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -67,7 +67,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage); #define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE) #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) -#define pmd_pgtable(PMD) ((pte_t *)pmd_page_vaddr(PMD)) void pgtable_free(void *table, bool is_page); diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 0888bda245f5..ebaf374b55ab 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -432,4 +432,6 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, /* We provide our own get_unmapped_area to cope with VA holes for userland */ #define HAVE_ARCH_UNMAPPED_AREA +#define pmd_pgtable(pmd) ((pgtable_t)__pmd_page(pmd)) + #endif /* !(_SPARC_PGTABLE_H) */ diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index a400e0f23046..e0ee48ec3903 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1117,6 +1117,8 @@ extern unsigned long cmdline_memory_size; asmlinkage void do_sparc64_fault(struct pt_regs *regs); +#define pmd_pgtable(PMD) ((pte_t *)pmd_page_vaddr(PMD)) + #ifdef CONFIG_HUGETLB_PAGE #define pud_leaf_size pud_leaf_size diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 2bbf28cf3aa9..8ec7cd46dd96 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -19,7 +19,6 @@ set_pmd(pmd, __pmd(_PAGE_TABLE + \ ((unsigned long long)page_to_pfn(pte) << \ (unsigned long long) PAGE_SHIFT))) -#define pmd_pgtable(pmd) pmd_page(pmd) /* * Allocate and free page tables. diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 62ad61d6fefc..c7ec5bb88334 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -84,8 +84,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); } -#define pmd_pgtable(pmd) pmd_page(pmd) - #if CONFIG_PGTABLE_LEVELS > 2 extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index d3a22da4d2c9..eeb2de3a89e5 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -25,7 +25,6 @@ (pmd_val(*(pmdp)) = ((unsigned long)ptep)) #define pmd_populate(mm, pmdp, page) \ (pmd_val(*(pmdp)) = ((unsigned long)page_to_virt(page))) -#define pmd_pgtable(pmd) pmd_page(pmd) static inline pgd_t* pgd_alloc(struct mm_struct *mm) @@ -63,7 +62,6 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) return page; } -#define pmd_pgtable(pmd) pmd_page(pmd) #endif /* CONFIG_MMU */ #endif /* _XTENSA_PGALLOC_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 69700e3e615f..e82660f7b9e4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -37,6 +37,15 @@ #define FIRST_USER_ADDRESS 0UL #endif +/* + * This defines the generic helper for accessing PMD page + * table page. Although platforms can still override this + * via their respective . + */ +#ifndef pmd_pgtable +#define pmd_pgtable(pmd) pmd_page(pmd) +#endif + /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * -- cgit v1.2.3 From f39650de687e35766572ac89dbcd16a5911e2f0a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 30 Jun 2021 18:54:59 -0700 Subject: kernel.h: split out panic and oops helpers kernel.h is being used as a dump for all kinds of stuff for a long time. Here is the attempt to start cleaning it up by splitting out panic and oops helpers. There are several purposes of doing this: - dropping dependency in bug.h - dropping a loop by moving out panic_notifier.h - unload kernel.h from something which has its own domain At the same time convert users tree-wide to use new headers, although for the time being include new header back to kernel.h to avoid twisted indirected includes for existing users. [akpm@linux-foundation.org: thread_info.h needs limits.h] [andriy.shevchenko@linux.intel.com: ia64 fix] Link: https://lkml.kernel.org/r/20210520130557.55277-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20210511074137.33666-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Bjorn Andersson Co-developed-by: Andrew Morton Acked-by: Mike Rapoport Acked-by: Corey Minyard Acked-by: Christian Brauner Acked-by: Arnd Bergmann Acked-by: Kees Cook Acked-by: Wei Liu Acked-by: Rasmus Villemoes Signed-off-by: Andrew Morton Acked-by: Sebastian Reichel Acked-by: Luis Chamberlain Acked-by: Stephen Boyd Acked-by: Thomas Bogendoerfer Acked-by: Helge Deller # parisc Signed-off-by: Linus Torvalds --- arch/alpha/kernel/setup.c | 2 +- arch/arm64/kernel/setup.c | 1 + arch/ia64/include/asm/pal.h | 1 + arch/mips/kernel/relocate.c | 1 + arch/mips/sgi-ip22/ip22-reset.c | 1 + arch/mips/sgi-ip32/ip32-reset.c | 1 + arch/parisc/kernel/pdc_chassis.c | 1 + arch/powerpc/kernel/setup-common.c | 1 + arch/s390/kernel/ipl.c | 1 + arch/sparc/kernel/sstate.c | 1 + arch/um/drivers/mconsole_kern.c | 1 + arch/um/kernel/um_arch.c | 1 + arch/x86/include/asm/desc.h | 1 + arch/x86/kernel/cpu/mshyperv.c | 1 + arch/x86/kernel/setup.c | 1 + arch/x86/purgatory/purgatory.c | 2 + arch/x86/xen/enlighten.c | 1 + arch/xtensa/platforms/iss/setup.c | 1 + drivers/bus/brcmstb_gisb.c | 1 + drivers/char/ipmi/ipmi_msghandler.c | 1 + drivers/clk/analogbits/wrpll-cln28hpc.c | 4 + drivers/edac/altera_edac.c | 1 + drivers/firmware/google/gsmi.c | 1 + drivers/hv/vmbus_drv.c | 1 + drivers/hwtracing/coresight/coresight-cpu-debug.c | 1 + drivers/leds/trigger/ledtrig-activity.c | 1 + drivers/leds/trigger/ledtrig-heartbeat.c | 1 + drivers/leds/trigger/ledtrig-panic.c | 1 + drivers/misc/bcm-vk/bcm_vk_dev.c | 1 + drivers/misc/ibmasm/heartbeat.c | 1 + drivers/misc/pvpanic/pvpanic.c | 1 + drivers/net/ipa/ipa_smp2p.c | 1 + drivers/parisc/power.c | 1 + drivers/power/reset/ltc2952-poweroff.c | 1 + drivers/remoteproc/remoteproc_core.c | 1 + drivers/s390/char/con3215.c | 1 + drivers/s390/char/con3270.c | 1 + drivers/s390/char/sclp.c | 1 + drivers/s390/char/sclp_con.c | 1 + drivers/s390/char/sclp_vt220.c | 1 + drivers/s390/char/zcore.c | 1 + drivers/soc/bcm/brcmstb/pm/pm-arm.c | 1 + drivers/staging/olpc_dcon/olpc_dcon.c | 1 + drivers/video/fbdev/hyperv_fb.c | 1 + include/asm-generic/bug.h | 3 +- include/linux/kernel.h | 84 +------------------ include/linux/panic.h | 98 +++++++++++++++++++++++ include/linux/panic_notifier.h | 12 +++ include/linux/thread_info.h | 1 + kernel/hung_task.c | 1 + kernel/kexec_core.c | 1 + kernel/panic.c | 1 + kernel/rcu/tree.c | 2 + kernel/sysctl.c | 1 + kernel/trace/trace.c | 1 + 55 files changed, 169 insertions(+), 85 deletions(-) create mode 100644 include/linux/panic.h create mode 100644 include/linux/panic_notifier.h (limited to 'arch/x86') diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 5f6858e9dc28..7d56c217b235 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,6 @@ #include #include -extern struct atomic_notifier_head panic_notifier_list; static int alpha_panic_event(struct notifier_block *, unsigned long, void *); static struct notifier_block alpha_panic_block = { alpha_panic_event, diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 61845c0821d9..787bc0f601b3 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/ia64/include/asm/pal.h b/arch/ia64/include/asm/pal.h index 5c51fceedaf9..e6b652f9e45e 100644 --- a/arch/ia64/include/asm/pal.h +++ b/arch/ia64/include/asm/pal.h @@ -99,6 +99,7 @@ #include #include +#include /* * Data types needed to pass information into PAL procedures and diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c index 499a5357c09f..56b51de2dc51 100644 --- a/arch/mips/kernel/relocate.c +++ b/arch/mips/kernel/relocate.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c index c374f3ceec38..9028dbbb45dd 100644 --- a/arch/mips/sgi-ip22/ip22-reset.c +++ b/arch/mips/sgi-ip22/ip22-reset.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c index 20d8637340be..18d1c115cd53 100644 --- a/arch/mips/sgi-ip32/ip32-reset.c +++ b/arch/mips/sgi-ip32/ip32-reset.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/parisc/kernel/pdc_chassis.c b/arch/parisc/kernel/pdc_chassis.c index 75ae88d13909..da154406d368 100644 --- a/arch/parisc/kernel/pdc_chassis.c +++ b/arch/parisc/kernel/pdc_chassis.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 74a98fff2c2f..046fe21b5c3b 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -9,6 +9,7 @@ #undef DEBUG #include +#include #include #include #include diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index dba04fbc37a2..36f870dc944f 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/sparc/kernel/sstate.c b/arch/sparc/kernel/sstate.c index ac8677c3841e..3bcc4ddc6911 100644 --- a/arch/sparc/kernel/sstate.c +++ b/arch/sparc/kernel/sstate.c @@ -6,6 +6,7 @@ #include #include +#include #include #include diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 6d00af25ec6b..328b16f99b30 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 74e07e748a9b..9512253947d5 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 476082a83d1c..ceb12683b6d1 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -9,6 +9,7 @@ #include #include +#include #include #include diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 22f13343b5da..9e5c6f2b044d 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1e720626069a..d21ba6fa39d9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index f03b64d9cb51..7558139920f8 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -9,6 +9,8 @@ */ #include +#include +#include #include #include diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index aa9f50fccc5d..c79bd0af2e8c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/xtensa/platforms/iss/setup.c b/arch/xtensa/platforms/iss/setup.c index ed519aee0ec8..d3433e1bb94e 100644 --- a/arch/xtensa/platforms/iss/setup.c +++ b/arch/xtensa/platforms/iss/setup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/drivers/bus/brcmstb_gisb.c b/drivers/bus/brcmstb_gisb.c index 7355fa2cb439..6551286a60cc 100644 --- a/drivers/bus/brcmstb_gisb.c +++ b/drivers/bus/brcmstb_gisb.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 8a0e97b33cae..e96cb5c4f97a 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/drivers/clk/analogbits/wrpll-cln28hpc.c b/drivers/clk/analogbits/wrpll-cln28hpc.c index 776ead319ae9..7c64ea52a8d5 100644 --- a/drivers/clk/analogbits/wrpll-cln28hpc.c +++ b/drivers/clk/analogbits/wrpll-cln28hpc.c @@ -23,8 +23,12 @@ #include #include +#include #include #include +#include +#include + #include /* MIN_INPUT_FREQ: minimum input clock frequency, in Hz (Fref_min) */ diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 5f7fd79ec82f..61c21bd880a4 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c index bb6e77ee3898..adaa492c3d2d 100644 --- a/drivers/firmware/google/gsmi.c +++ b/drivers/firmware/google/gsmi.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 92cb3f7d21d9..57bbbaa4e8f7 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c index 2dcf13de751f..9731d3a96073 100644 --- a/drivers/hwtracing/coresight/coresight-cpu-debug.c +++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/leds/trigger/ledtrig-activity.c b/drivers/leds/trigger/ledtrig-activity.c index 14ba7faaed9e..30bc9df03636 100644 --- a/drivers/leds/trigger/ledtrig-activity.c +++ b/drivers/leds/trigger/ledtrig-activity.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/leds/trigger/ledtrig-heartbeat.c b/drivers/leds/trigger/ledtrig-heartbeat.c index 36b6709afe9f..7fe0a05574d2 100644 --- a/drivers/leds/trigger/ledtrig-heartbeat.c +++ b/drivers/leds/trigger/ledtrig-heartbeat.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/leds/trigger/ledtrig-panic.c b/drivers/leds/trigger/ledtrig-panic.c index 5751cd032f9d..64abf2e91608 100644 --- a/drivers/leds/trigger/ledtrig-panic.c +++ b/drivers/leds/trigger/ledtrig-panic.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "../leds.h" diff --git a/drivers/misc/bcm-vk/bcm_vk_dev.c b/drivers/misc/bcm-vk/bcm_vk_dev.c index 6bfea3210389..ad639ee85b2a 100644 --- a/drivers/misc/bcm-vk/bcm_vk_dev.c +++ b/drivers/misc/bcm-vk/bcm_vk_dev.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/ibmasm/heartbeat.c b/drivers/misc/ibmasm/heartbeat.c index 4f5f3bdc814d..59c9a0d95659 100644 --- a/drivers/misc/ibmasm/heartbeat.c +++ b/drivers/misc/ibmasm/heartbeat.c @@ -9,6 +9,7 @@ */ #include +#include #include "ibmasm.h" #include "dot_command.h" #include "lowlevel.h" diff --git a/drivers/misc/pvpanic/pvpanic.c b/drivers/misc/pvpanic/pvpanic.c index 65f70a4da8c0..793ea0c01193 100644 --- a/drivers/misc/pvpanic/pvpanic.c +++ b/drivers/misc/pvpanic/pvpanic.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ipa/ipa_smp2p.c b/drivers/net/ipa/ipa_smp2p.c index a5f7a79a1923..34b68dc43886 100644 --- a/drivers/net/ipa/ipa_smp2p.c +++ b/drivers/net/ipa/ipa_smp2p.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index ebaf6867b457..456776bd8ee6 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c index d1495af30081..8688c8ba8894 100644 --- a/drivers/power/reset/ltc2952-poweroff.c +++ b/drivers/power/reset/ltc2952-poweroff.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 626a6b90fba2..76dd8e2b1e7e 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c index 1fd5bca9fa20..02523f4e29f4 100644 --- a/drivers/s390/char/con3215.c +++ b/drivers/s390/char/con3215.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include /* ASYNC_* flags */ #include diff --git a/drivers/s390/char/con3270.c b/drivers/s390/char/con3270.c index e21962c0fd94..87cdbace1453 100644 --- a/drivers/s390/char/con3270.c +++ b/drivers/s390/char/con3270.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index 986bbbc23d0a..6627820a5eb9 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/char/sclp_con.c b/drivers/s390/char/sclp_con.c index 9b852a47ccc1..cc01a7b8595d 100644 --- a/drivers/s390/char/sclp_con.c +++ b/drivers/s390/char/sclp_con.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c index 7f4445b0f819..5b8a7b090a97 100644 --- a/drivers/s390/char/sclp_vt220.c +++ b/drivers/s390/char/sclp_vt220.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index bd3c724bf695..b5b0848da93b 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/drivers/soc/bcm/brcmstb/pm/pm-arm.c b/drivers/soc/bcm/brcmstb/pm/pm-arm.c index a673fdffe216..3cbb165d6e30 100644 --- a/drivers/soc/bcm/brcmstb/pm/pm-arm.c +++ b/drivers/soc/bcm/brcmstb/pm/pm-arm.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/olpc_dcon/olpc_dcon.c b/drivers/staging/olpc_dcon/olpc_dcon.c index 6d8e9a481786..7284cb4ac395 100644 --- a/drivers/staging/olpc_dcon/olpc_dcon.c +++ b/drivers/staging/olpc_dcon/olpc_dcon.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index a7e6eea2c4a1..23999df52739 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index b402494883b6..f152b9bb916f 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -17,7 +17,8 @@ #endif #ifndef __ASSEMBLY__ -#include +#include +#include #ifdef CONFIG_BUG diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bf950621febf..baea2eb763d0 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -72,7 +73,6 @@ #define lower_32_bits(n) ((u32)((n) & 0xffffffff)) struct completion; -struct pt_regs; struct user; #ifdef CONFIG_PREEMPT_VOLUNTARY @@ -177,14 +177,6 @@ void __might_fault(const char *file, int line); static inline void might_fault(void) { } #endif -extern struct atomic_notifier_head panic_notifier_list; -extern long (*panic_blink)(int state); -__printf(1, 2) -void panic(const char *fmt, ...) __noreturn __cold; -void nmi_panic(struct pt_regs *regs, const char *msg); -extern void oops_enter(void); -extern void oops_exit(void); -extern bool oops_may_print(void); void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; @@ -372,52 +364,8 @@ extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); -#ifdef CONFIG_SMP -extern unsigned int sysctl_oops_all_cpu_backtrace; -#else -#define sysctl_oops_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - extern void bust_spinlocks(int yes); -extern int panic_timeout; -extern unsigned long panic_print; -extern int panic_on_oops; -extern int panic_on_unrecovered_nmi; -extern int panic_on_io_nmi; -extern int panic_on_warn; -extern unsigned long panic_on_taint; -extern bool panic_on_taint_nousertaint; -extern int sysctl_panic_on_rcu_stall; -extern int sysctl_max_rcu_stall_to_panic; -extern int sysctl_panic_on_stackoverflow; - -extern bool crash_kexec_post_notifiers; -/* - * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It - * holds a CPU number which is executing panic() currently. A value of - * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec(). - */ -extern atomic_t panic_cpu; -#define PANIC_CPU_INVALID -1 - -/* - * Only to be used by arch init code. If the user over-wrote the default - * CONFIG_PANIC_TIMEOUT, honor it. - */ -static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout) -{ - if (panic_timeout == arch_default_timeout) - panic_timeout = timeout; -} -extern const char *print_tainted(void); -enum lockdep_ok { - LOCKDEP_STILL_OK, - LOCKDEP_NOW_UNRELIABLE -}; -extern void add_taint(unsigned flag, enum lockdep_ok); -extern int test_taint(unsigned flag); -extern unsigned long get_taint(void); extern int root_mountflags; extern bool early_boot_irqs_disabled; @@ -436,36 +384,6 @@ extern enum system_states { SYSTEM_SUSPEND, } system_state; -/* This cannot be an enum because some may be used in assembly source. */ -#define TAINT_PROPRIETARY_MODULE 0 -#define TAINT_FORCED_MODULE 1 -#define TAINT_CPU_OUT_OF_SPEC 2 -#define TAINT_FORCED_RMMOD 3 -#define TAINT_MACHINE_CHECK 4 -#define TAINT_BAD_PAGE 5 -#define TAINT_USER 6 -#define TAINT_DIE 7 -#define TAINT_OVERRIDDEN_ACPI_TABLE 8 -#define TAINT_WARN 9 -#define TAINT_CRAP 10 -#define TAINT_FIRMWARE_WORKAROUND 11 -#define TAINT_OOT_MODULE 12 -#define TAINT_UNSIGNED_MODULE 13 -#define TAINT_SOFTLOCKUP 14 -#define TAINT_LIVEPATCH 15 -#define TAINT_AUX 16 -#define TAINT_RANDSTRUCT 17 -#define TAINT_FLAGS_COUNT 18 -#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) - -struct taint_flag { - char c_true; /* character printed when tainted */ - char c_false; /* character printed when not tainted */ - bool module; /* also show as a per-module taint flag */ -}; - -extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT]; - extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] #define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4] diff --git a/include/linux/panic.h b/include/linux/panic.h new file mode 100644 index 000000000000..f5844908a089 --- /dev/null +++ b/include/linux/panic.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PANIC_H +#define _LINUX_PANIC_H + +#include +#include + +struct pt_regs; + +extern long (*panic_blink)(int state); +__printf(1, 2) +void panic(const char *fmt, ...) __noreturn __cold; +void nmi_panic(struct pt_regs *regs, const char *msg); +extern void oops_enter(void); +extern void oops_exit(void); +extern bool oops_may_print(void); + +#ifdef CONFIG_SMP +extern unsigned int sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + +extern int panic_timeout; +extern unsigned long panic_print; +extern int panic_on_oops; +extern int panic_on_unrecovered_nmi; +extern int panic_on_io_nmi; +extern int panic_on_warn; + +extern unsigned long panic_on_taint; +extern bool panic_on_taint_nousertaint; + +extern int sysctl_panic_on_rcu_stall; +extern int sysctl_max_rcu_stall_to_panic; +extern int sysctl_panic_on_stackoverflow; + +extern bool crash_kexec_post_notifiers; + +/* + * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It + * holds a CPU number which is executing panic() currently. A value of + * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec(). + */ +extern atomic_t panic_cpu; +#define PANIC_CPU_INVALID -1 + +/* + * Only to be used by arch init code. If the user over-wrote the default + * CONFIG_PANIC_TIMEOUT, honor it. + */ +static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout) +{ + if (panic_timeout == arch_default_timeout) + panic_timeout = timeout; +} + +/* This cannot be an enum because some may be used in assembly source. */ +#define TAINT_PROPRIETARY_MODULE 0 +#define TAINT_FORCED_MODULE 1 +#define TAINT_CPU_OUT_OF_SPEC 2 +#define TAINT_FORCED_RMMOD 3 +#define TAINT_MACHINE_CHECK 4 +#define TAINT_BAD_PAGE 5 +#define TAINT_USER 6 +#define TAINT_DIE 7 +#define TAINT_OVERRIDDEN_ACPI_TABLE 8 +#define TAINT_WARN 9 +#define TAINT_CRAP 10 +#define TAINT_FIRMWARE_WORKAROUND 11 +#define TAINT_OOT_MODULE 12 +#define TAINT_UNSIGNED_MODULE 13 +#define TAINT_SOFTLOCKUP 14 +#define TAINT_LIVEPATCH 15 +#define TAINT_AUX 16 +#define TAINT_RANDSTRUCT 17 +#define TAINT_FLAGS_COUNT 18 +#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) + +struct taint_flag { + char c_true; /* character printed when tainted */ + char c_false; /* character printed when not tainted */ + bool module; /* also show as a per-module taint flag */ +}; + +extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT]; + +enum lockdep_ok { + LOCKDEP_STILL_OK, + LOCKDEP_NOW_UNRELIABLE, +}; + +extern const char *print_tainted(void); +extern void add_taint(unsigned flag, enum lockdep_ok); +extern int test_taint(unsigned flag); +extern unsigned long get_taint(void); + +#endif /* _LINUX_PANIC_H */ diff --git a/include/linux/panic_notifier.h b/include/linux/panic_notifier.h new file mode 100644 index 000000000000..41e32483d7a7 --- /dev/null +++ b/include/linux/panic_notifier.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PANIC_NOTIFIERS_H +#define _LINUX_PANIC_NOTIFIERS_H + +#include +#include + +extern struct atomic_notifier_head panic_notifier_list; + +extern bool crash_kexec_post_notifiers; + +#endif /* _LINUX_PANIC_NOTIFIERS_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 157762db9d4b..0999f6317978 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -9,6 +9,7 @@ #define _LINUX_THREAD_INFO_H #include +#include #include #include #include diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 396ebaebea3f..ebc641664eb0 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index f099baee3578..4b34a9aa32bc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/panic.c b/kernel/panic.c index 332736a72a58..edad89660a2b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e78b2430c16..f12056beb916 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bb37739f5731..70e286c83d6b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d23a09d3eb37..3c1384bc5c5a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 66ce75144d4b33e376f187df3dec495fe47d2ad0 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 30 Jun 2021 18:56:31 -0700 Subject: kprobes: remove duplicated strong free_insn_page in x86 and s390 free_insn_page() in x86 and s390 is same with the common weak function in kernel/kprobes.c. Plus, the comment "Recover page to RW mode before releasing it" in x86 seems insensible to be there since resetting mapping is done by common code in vfree() of module_memfree(). So drop these two duplicated strong functions and related comment, then mark the common one in kernel/kprobes.c strong. Link: https://lkml.kernel.org/r/20210608065736.32656-1-song.bao.hua@hisilicon.com Signed-off-by: Barry Song Acked-by: Masami Hiramatsu Acked-by: Heiko Carstens Reviewed-by: Christoph Hellwig Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Qi Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/kprobes.c | 5 ----- arch/x86/kernel/kprobes/core.c | 6 ------ include/linux/kprobes.h | 1 - kernel/kprobes.c | 2 +- 4 files changed, 1 insertion(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index aae24dc75df6..60cfbd24229b 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -44,11 +44,6 @@ void *alloc_insn_page(void) return page; } -void free_insn_page(void *page) -{ - module_memfree(page); -} - static void *alloc_s390_insn_page(void) { if (xchg(&insn_page_in_use, 1) == 1) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d3d65545cb8b..3bce67d3a03c 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -422,12 +422,6 @@ void *alloc_insn_page(void) return page; } -/* Recover page to RW mode before releasing it */ -void free_insn_page(void *page) -{ - module_memfree(page); -} - /* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1883a4a9f16a..c98a35a75f40 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -407,7 +407,6 @@ int enable_kprobe(struct kprobe *kp); void dump_kprobe(struct kprobe *kp); void *alloc_insn_page(void); -void free_insn_page(void *page); int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 745f08fdd7a6..e0c4c9d57299 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -106,7 +106,7 @@ void __weak *alloc_insn_page(void) return module_alloc(PAGE_SIZE); } -void __weak free_insn_page(void *page) +static void free_insn_page(void *page) { module_memfree(page); } -- cgit v1.2.3 From 87bf399f86ecf36cc84fbeb7027a2995af649d6e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 25 Jun 2021 21:32:47 +0800 Subject: perf/x86/cstate: Add ICELAKE_X and ICELAKE_D support Introduce icx_cstates for ICELAKE_X and ICELAKE_D, and also update the comments. On ICELAKE_X and ICELAKE_D, Core C1, Core C6, Package C2 and Package C6 Residency MSRs are supported. This patch has been tested on real hardware. Signed-off-by: Zhang Rui Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Acked-by: Artem Bityutskiy Link: https://lkml.kernel.org/r/20210625133247.2813-1-rui.zhang@intel.com --- arch/x86/events/intel/cstate.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 433399069e27..c6262b154c3a 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -40,7 +40,7 @@ * Model specific counters: * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 - * Available model: SLM,AMT,GLM,CNL,TNT,ADL + * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -50,8 +50,8 @@ * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT,RKL,ADL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, + * TGL,TNT,RKL,ADL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -61,7 +61,7 @@ * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL,TNT,RKL,ADL + * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 @@ -72,8 +72,8 @@ * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT,RKL,ADL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, + * TGL,TNT,RKL,ADL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -566,6 +566,14 @@ static const struct cstate_model icl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model icx_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C6_RES), +}; + static const struct cstate_model adl_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES) | @@ -664,6 +672,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), -- cgit v1.2.3 From d4ba0b06306a70c99a43f9d452886a86e2d3bd26 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 24 Jun 2021 11:17:57 -0700 Subject: perf/x86/intel/uncore: Clean up error handling path of iio mapping The error handling path of iio mapping looks fragile. We already fixed one issue caused by it, commit f797f05d917f ("perf/x86/intel/uncore: Fix for iio mapping on Skylake Server"). Clean up the error handling path and make the code robust. Reported-by: gushengxian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/40e66cf9-398b-20d7-ce4d-433be6e08921@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 3a75a2c601c2..1f7bb4898a9d 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3789,11 +3789,11 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type) /* One more for NULL. */ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); if (!attrs) - goto err; + goto clear_topology; eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); if (!eas) - goto err; + goto clear_attrs; for (die = 0; die < uncore_max_dies(); die++) { sprintf(buf, "die%ld", die); @@ -3814,7 +3814,9 @@ err: for (; die >= 0; die--) kfree(eas[die].attr.attr.name); kfree(eas); +clear_attrs: kfree(attrs); +clear_topology: kfree(type->topology); clear_attr_update: type->attr_update = NULL; -- cgit v1.2.3 From 7bb7f2ac24a028b20fca466b9633847b289b156a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 7 Jul 2021 18:08:11 -0700 Subject: arch, mm: wire up memfd_secret system call where relevant Wire up memfd_secret system call on architectures that define ARCH_HAS_SET_DIRECT_MAP, namely arm64, risc-v and x86. Link: https://lkml.kernel.org/r/20210518072034.31572-7-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Palmer Dabbelt Acked-by: Arnd Bergmann Acked-by: Catalin Marinas Acked-by: David Hildenbrand Acked-by: James Bottomley Cc: Alexander Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Christopher Lameter Cc: Dan Williams Cc: Dave Hansen Cc: David Hildenbrand Cc: Elena Reshetova Cc: Hagen Paul Pfeifer Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Bottomley Cc: "Kirill A. Shutemov" Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael Kerrisk Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Rick Edgecombe Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Thomas Gleixner Cc: Tycho Andersen Cc: Will Deacon Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/uapi/asm/unistd.h | 1 + arch/riscv/include/asm/unistd.h | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 1 + include/uapi/asm-generic/unistd.h | 7 ++++++- scripts/checksyscalls.sh | 4 ++++ 7 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/arm64/include/uapi/asm/unistd.h b/arch/arm64/include/uapi/asm/unistd.h index f83a70e07df8..ce2ee8f1e361 100644 --- a/arch/arm64/include/uapi/asm/unistd.h +++ b/arch/arm64/include/uapi/asm/unistd.h @@ -20,5 +20,6 @@ #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_TIME32_SYSCALLS #define __ARCH_WANT_SYS_CLONE3 +#define __ARCH_WANT_MEMFD_SECRET #include diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h index 977ee6181dab..6c316093a1e5 100644 --- a/arch/riscv/include/asm/unistd.h +++ b/arch/riscv/include/asm/unistd.h @@ -9,6 +9,7 @@ */ #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_MEMFD_SECRET #include diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index fba2f615119a..ce763a12311c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -451,3 +451,4 @@ 444 i386 landlock_create_ruleset sys_landlock_create_ruleset 445 i386 landlock_add_rule sys_landlock_add_rule 446 i386 landlock_restrict_self sys_landlock_restrict_self +447 i386 memfd_secret sys_memfd_secret diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index af973e400053..f6b57799c1ea 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -368,6 +368,7 @@ 444 common landlock_create_ruleset sys_landlock_create_ruleset 445 common landlock_add_rule sys_landlock_add_rule 446 common landlock_restrict_self sys_landlock_restrict_self +447 common memfd_secret sys_memfd_secret # # Due to a historical design error, certain syscalls are numbered differently diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 586128d5c3b8..69c9a7010081 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1050,6 +1050,7 @@ asmlinkage long sys_landlock_create_ruleset(const struct landlock_ruleset_attr _ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type rule_type, const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); +asmlinkage long sys_memfd_secret(unsigned int flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f211961ce1da..a9d6fcd95f42 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -873,8 +873,13 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) #define __NR_landlock_restrict_self 446 __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) +#ifdef __ARCH_WANT_MEMFD_SECRET +#define __NR_memfd_secret 447 +__SYSCALL(__NR_memfd_secret, sys_memfd_secret) +#endif + #undef __NR_syscalls -#define __NR_syscalls 447 +#define __NR_syscalls 448 /* * 32 bit systems traditionally used different diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh index a18b47695f55..b7609958ee36 100755 --- a/scripts/checksyscalls.sh +++ b/scripts/checksyscalls.sh @@ -40,6 +40,10 @@ cat << EOF #define __IGNORE_setrlimit /* setrlimit */ #endif +#ifndef __ARCH_WANT_MEMFD_SECRET +#define __IGNORE_memfd_secret +#endif + /* Missing flags argument */ #define __IGNORE_renameat /* renameat2 */ -- cgit v1.2.3 From 30120d72a41e0e29c859bd8d41a2dd4d4aa29d4d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Jul 2021 18:09:03 -0700 Subject: x86: convert to setup_initial_init_mm() Use setup_initial_init_mm() helper to simplify code. Link: https://lkml.kernel.org/r/20210608083418.137226-16-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9f1d9215a9fb..bff3a784aec5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -847,10 +847,7 @@ void __init setup_arch(char **cmdline_p) if (!boot_params.hdr.root_flags) root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) _text; - init_mm.end_code = (unsigned long) _etext; - init_mm.end_data = (unsigned long) _edata; - init_mm.brk = _brk_end; + setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end); code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; -- cgit v1.2.3 From 9ef8af2a8f25b16eec6d2865ca7d9116a24ad46a Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 7 Jul 2021 18:09:27 -0700 Subject: x86/dumpstack: use %pSb/%pBb for backtrace printing Let's use the new printk formats to print the stacktrace entries when printing a backtrace to the kernel logs. This will include any module's build ID[1] in it so that offline/crash debugging can easily locate the debuginfo for a module via something like debuginfod[2]. Link: https://lkml.kernel.org/r/20210511003845.2429846-8-swboyd@chromium.org Link: https://fedoraproject.org/wiki/Releases/FeatureBuildId [1] Link: https://sourceware.org/elfutils/Debuginfod.html [2] Signed-off-by: Stephen Boyd Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Jiri Olsa Cc: Alexei Starovoitov Cc: Jessica Yu Cc: Evan Green Cc: Hsin-Yi Wang Cc: Petr Mladek Cc: Steven Rostedt Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Baoquan He Cc: Catalin Marinas Cc: Dave Young Cc: Konstantin Khlebnikov Cc: Rasmus Villemoes Cc: Sasha Levin Cc: Sergey Senozhatsky Cc: Vivek Goyal Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/dumpstack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 299c20f0a38b..ea4fe192189d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -69,7 +69,7 @@ static void printk_stack_address(unsigned long address, int reliable, const char *log_lvl) { touch_nmi_watchdog(); - printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); + printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? ", (void *)address); } static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src, -- cgit v1.2.3 From 9cf6fa2458443118b84090aa1bf7a3630b5940e8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 7 Jul 2021 18:09:53 -0700 Subject: mm: rename pud_page_vaddr to pud_pgtable and make it return pmd_t * No functional change in this patch. [aneesh.kumar@linux.ibm.com: fix] Link: https://lkml.kernel.org/r/87wnqtnb60.fsf@linux.ibm.com [sfr@canb.auug.org.au: another fix] Link: https://lkml.kernel.org/r/20210619134410.89559-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20210615110859.320299-1-aneesh.kumar@linux.ibm.com Link: https://lore.kernel.org/linuxppc-dev/CAHk-=wi+J+iodze9FtjM3Zi4j4OeS+qqbKxME9QN4roxPEXH9Q@mail.gmail.com/ Signed-off-by: Aneesh Kumar K.V Signed-off-by: Stephen Rothwell Cc: Christophe Leroy Cc: Hugh Dickins Cc: Joel Fernandes Cc: Kalesh Singh Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 8 +++++--- arch/arm/include/asm/pgtable-3level.h | 2 +- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/ia64/include/asm/pgtable.h | 2 +- arch/m68k/include/asm/motorola_pgtable.h | 2 +- arch/mips/include/asm/pgtable-64.h | 4 ++-- arch/parisc/include/asm/pgtable.h | 4 ++-- arch/powerpc/include/asm/book3s/64/pgtable.h | 6 +++++- arch/powerpc/include/asm/nohash/64/pgtable.h | 6 +++++- arch/powerpc/mm/book3s64/radix_pgtable.c | 4 ++-- arch/powerpc/mm/pgtable_64.c | 2 +- arch/riscv/include/asm/pgtable-64.h | 4 ++-- arch/sh/include/asm/pgtable-3level.h | 4 ++-- arch/sparc/include/asm/pgtable_32.h | 6 +++--- arch/sparc/include/asm/pgtable_64.h | 6 +++--- arch/um/include/asm/pgtable-3level.h | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/pat/set_memory.c | 4 ++-- arch/x86/mm/pgtable.c | 2 +- include/asm-generic/pgtable-nopmd.h | 2 +- include/asm-generic/pgtable-nopud.h | 2 +- include/linux/pgtable.h | 2 +- 22 files changed, 46 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index ff690846465e..02f0429f1068 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -236,8 +236,10 @@ pmd_page_vaddr(pmd_t pmd) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> 32)) #define pud_page(pud) (pfn_to_page(pud_val(pud) >> 32)) -extern inline unsigned long pud_page_vaddr(pud_t pgd) -{ return PAGE_OFFSET + ((pud_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); } +extern inline pmd_t *pud_pgtable(pud_t pgd) +{ + return (pmd_t *)(PAGE_OFFSET + ((pud_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT))); +} extern inline int pte_none(pte_t pte) { return !pte_val(pte); } extern inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_VALID; } @@ -287,7 +289,7 @@ extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; retu /* Find an entry in the second-level page table.. */ extern inline pmd_t * pmd_offset(pud_t * dir, unsigned long address) { - pmd_t *ret = (pmd_t *) pud_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); + pmd_t *ret = pud_pgtable(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); smp_rmb(); /* see above */ return ret; } diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index d4edab51a77c..eabe72ff7381 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -130,7 +130,7 @@ flush_pmd_entry(pudp); \ } while (0) -static inline pmd_t *pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { return __va(pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK); } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 508c7ffad515..d6b8e74f3018 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -649,9 +649,9 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) return __pud_to_phys(pud); } -static inline unsigned long pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { - return (unsigned long)__va(pud_page_paddr(pud)); + return (pmd_t *)__va(pud_page_paddr(pud)); } /* Find an entry in the second-level page table. */ diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 3f5dbbd8b9d8..7599d04ae192 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -273,7 +273,7 @@ ia64_phys_addr_valid (unsigned long addr) #define pud_bad(pud) (!ia64_phys_addr_valid(pud_val(pud))) #define pud_present(pud) (pud_val(pud) != 0UL) #define pud_clear(pudp) (pud_val(*(pudp)) = 0UL) -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & _PFN_MASK)) +#define pud_pgtable(pud) ((pmd_t *) __va(pud_val(pud) & _PFN_MASK)) #define pud_page(pud) virt_to_page((pud_val(pud) + PAGE_OFFSET)) #if CONFIG_PGTABLE_LEVELS == 4 diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index a2908164ee6f..022c3abc280d 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -131,7 +131,7 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp) #define __pte_page(pte) ((unsigned long)__va(pte_val(pte) & PAGE_MASK)) #define pmd_page_vaddr(pmd) ((unsigned long)__va(pmd_val(pmd) & _TABLE_MASK)) -#define pud_page_vaddr(pud) ((unsigned long)__va(pud_val(pud) & _TABLE_MASK)) +#define pud_pgtable(pud) ((pmd_t *)__va(pud_val(pud) & _TABLE_MASK)) #define pte_none(pte) (!pte_val(pte)) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 046465906c82..1136b6f3e41e 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -313,9 +313,9 @@ static inline void pud_clear(pud_t *pudp) #endif #ifndef __PAGETABLE_PMD_FOLDED -static inline unsigned long pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { - return pud_val(pud); + return (pmd_t *)pud_val(pud); } #define pud_phys(pud) virt_to_phys((void *)pud_val(pud)) #define pud_page(pud) (pfn_to_page(pud_phys(pud) >> PAGE_SHIFT)) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 7f33c29764cc..43937af127b1 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -322,8 +322,8 @@ static inline void pmd_clear(pmd_t *pmd) { #if CONFIG_PGTABLE_LEVELS == 3 -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_address(pud))) -#define pud_page(pud) virt_to_page((void *)pud_page_vaddr(pud)) +#define pud_pgtable(pud) ((pmd_t *) __va(pud_address(pud))) +#define pud_page(pud) virt_to_page((void *)pud_pgtable(pud)) /* For 64 bit we have three level tables */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 4d9941b2fe51..536037017c3e 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1051,9 +1051,13 @@ extern struct page *p4d_page(p4d_t p4d); /* Pointers in the page table tree are physical addresses */ #define __pgtable_ptr_val(ptr) __pa(ptr) -#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) #define p4d_page_vaddr(p4d) __va(p4d_val(p4d) & ~P4D_MASKED_BITS) +static inline pmd_t *pud_pgtable(pud_t pud) +{ + return (pmd_t *)__va(pud_val(pud) & ~PUD_MASKED_BITS); +} + #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 53fbfdfac93d..d081704b13fb 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -162,7 +162,11 @@ static inline void pud_clear(pud_t *pudp) #define pud_bad(pud) (!is_kernel_addr(pud_val(pud)) \ || (pud_val(pud) & PUD_BAD_BITS)) #define pud_present(pud) (pud_val(pud) != 0) -#define pud_page_vaddr(pud) (pud_val(pud) & ~PUD_MASKED_BITS) + +static inline pmd_t *pud_pgtable(pud_t pud) +{ + return (pmd_t *)(pud_val(pud) & ~PUD_MASKED_BITS); +} extern struct page *pud_page(pud_t pud); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 2176a5f70746..54f135b1ee60 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -820,7 +820,7 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, continue; } - pmd_base = (pmd_t *)pud_page_vaddr(*pud); + pmd_base = pud_pgtable(*pud); remove_pmd_table(pmd_base, addr, next); free_pmd_table(pmd_base, pud); } @@ -1105,7 +1105,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) pmd_t *pmd; int i; - pmd = (pmd_t *)pud_page_vaddr(*pud); + pmd = pud_pgtable(*pud); pud_clear(pud); flush_tlb_kernel_range(addr, addr + PUD_SIZE); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index cc6e2f94517f..4ba311808bdb 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -115,7 +115,7 @@ struct page *pud_page(pud_t pud) VM_WARN_ON(!pud_huge(pud)); return pte_page(pud_pte(pud)); } - return virt_to_page(pud_page_vaddr(pud)); + return virt_to_page(pud_pgtable(pud)); } /* diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index f3b0da64c6c8..0e863f3f7187 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -60,9 +60,9 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } -static inline unsigned long pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { - return (unsigned long)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); + return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); } static inline struct page *pud_page(pud_t pud) diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 82d74472dfcd..56bf35c2f29c 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -32,9 +32,9 @@ typedef struct { unsigned long long pmd; } pmd_t; #define pmd_val(x) ((x).pmd) #define __pmd(x) ((pmd_t) { (x) } ) -static inline unsigned long pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { - return pud_val(pud); + return (pmd_t *)pud_val(pud); } /* only used by the stubbed out hugetlb gup code, should never be called */ diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index ebaf374b55ab..ffccfe3b22ed 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -151,13 +151,13 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) return (unsigned long)__nocache_va(v << 4); } -static inline unsigned long pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { if (srmmu_device_memory(pud_val(pud))) { - return ~0; + return (pmd_t *)~0; } else { unsigned long v = pud_val(pud) & SRMMU_PTD_PMASK; - return (unsigned long)__nocache_va(v << 4); + return (pmd_t *)__nocache_va(v << 4); } } diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index e0ee48ec3903..23da25d4765c 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -841,18 +841,18 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) return ((unsigned long) __va(pfn << PAGE_SHIFT)); } -static inline unsigned long pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { pte_t pte = __pte(pud_val(pud)); unsigned long pfn; pfn = pte_pfn(pte); - return ((unsigned long) __va(pfn << PAGE_SHIFT)); + return ((pmd_t *) __va(pfn << PAGE_SHIFT)); } #define pmd_page(pmd) virt_to_page((void *)pmd_page_vaddr(pmd)) -#define pud_page(pud) virt_to_page((void *)pud_page_vaddr(pud)) +#define pud_page(pud) virt_to_page((void *)pud_pgtable(pud)) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0UL) #define pud_present(pud) (pud_val(pud) != 0U) #define pud_clear(pudp) (pud_val(*(pudp)) = 0UL) diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 9289a86643a9..cb896e6121c8 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h @@ -83,7 +83,7 @@ static inline void pud_clear (pud_t *pud) } #define pud_page(pud) phys_to_page(pud_val(pud) & PAGE_MASK) -#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PAGE_MASK)) +#define pud_pgtable(pud) ((pmd_t *) __va(pud_val(pud) & PAGE_MASK)) static inline unsigned long pte_pfn(pte_t pte) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ec0d8e106646..e064c5fef0f3 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -836,9 +836,9 @@ static inline int pud_present(pud_t pud) return pud_flags(pud) & _PAGE_PRESENT; } -static inline unsigned long pud_page_vaddr(pud_t pud) +static inline pmd_t *pud_pgtable(pud_t pud) { - return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud)); + return (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud)); } /* diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 156cd235659f..ad8a5c586a35 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1134,7 +1134,7 @@ static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { if (unmap_pte_range(pmd, start, end)) - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } @@ -1178,7 +1178,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) * Try again to free the PMD page if haven't succeeded above. */ if (!pud_none(*pud)) - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1303ff6ef7be..3364fe62b903 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -801,7 +801,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) pte_t *pte; int i; - pmd = (pmd_t *)pud_page_vaddr(*pud); + pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index 3e13acd019ae..10789cf51d16 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -51,7 +51,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address) #define __pmd(x) ((pmd_t) { __pud(x) } ) #define pud_page(pud) (pmd_page((pmd_t){ pud })) -#define pud_page_vaddr(pud) (pmd_page_vaddr((pmd_t){ pud })) +#define pud_pgtable(pud) ((pmd_t *)(pmd_page_vaddr((pmd_t){ pud }))) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index a9d751fbda9e..7cbd15f70bf5 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -49,7 +49,7 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) #define __pud(x) ((pud_t) { __p4d(x) }) #define p4d_page(p4d) (pud_page((pud_t){ p4d })) -#define p4d_page_vaddr(p4d) (pud_page_vaddr((pud_t){ p4d })) +#define p4d_page_vaddr(p4d) (pud_pgtable((pud_t){ p4d })) /* * allocating and freeing a pud is trivial: the 1-entry pud is diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e82660f7b9e4..c7c992ada1fe 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -106,7 +106,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { - return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); + return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif -- cgit v1.2.3 From dc4875f0e791de554bdc45aa1dbd6e45e107e50f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 7 Jul 2021 18:09:56 -0700 Subject: mm: rename p4d_page_vaddr to p4d_pgtable and make it return pud_t * No functional change in this patch. [aneesh.kumar@linux.ibm.com: m68k build error reported by kernel robot] Link: https://lkml.kernel.org/r/87tulxnb2v.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20210615110859.320299-2-aneesh.kumar@linux.ibm.com Link: https://lore.kernel.org/linuxppc-dev/CAHk-=wi+J+iodze9FtjM3Zi4j4OeS+qqbKxME9QN4roxPEXH9Q@mail.gmail.com/ Signed-off-by: Aneesh Kumar K.V Cc: Christophe Leroy Cc: Hugh Dickins Cc: Joel Fernandes Cc: Kalesh Singh Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/ia64/include/asm/pgtable.h | 2 +- arch/mips/include/asm/pgtable-64.h | 4 ++-- arch/powerpc/include/asm/book3s/64/pgtable.h | 5 ++++- arch/powerpc/include/asm/nohash/64/pgtable-4k.h | 6 +++++- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- arch/sparc/include/asm/pgtable_64.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- include/asm-generic/pgtable-nop4d.h | 2 +- include/asm-generic/pgtable-nopud.h | 2 +- include/linux/pgtable.h | 2 +- 13 files changed, 25 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d6b8e74f3018..f09bf5c02891 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -710,9 +710,9 @@ static inline phys_addr_t p4d_page_paddr(p4d_t p4d) return __p4d_to_phys(p4d); } -static inline unsigned long p4d_page_vaddr(p4d_t p4d) +static inline pud_t *p4d_pgtable(p4d_t p4d) { - return (unsigned long)__va(p4d_page_paddr(p4d)); + return (pud_t *)__va(p4d_page_paddr(p4d)); } /* Find an entry in the frst-level page table. */ diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 7599d04ae192..9584b2c5f394 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -281,7 +281,7 @@ ia64_phys_addr_valid (unsigned long addr) #define p4d_bad(p4d) (!ia64_phys_addr_valid(p4d_val(p4d))) #define p4d_present(p4d) (p4d_val(p4d) != 0UL) #define p4d_clear(p4dp) (p4d_val(*(p4dp)) = 0UL) -#define p4d_page_vaddr(p4d) ((unsigned long) __va(p4d_val(p4d) & _PFN_MASK)) +#define p4d_pgtable(p4d) ((pud_t *) __va(p4d_val(p4d) & _PFN_MASK)) #define p4d_page(p4d) virt_to_page((p4d_val(p4d) + PAGE_OFFSET)) #endif diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 1136b6f3e41e..41921acdc9d8 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -209,9 +209,9 @@ static inline void p4d_clear(p4d_t *p4dp) p4d_val(*p4dp) = (unsigned long)invalid_pud_table; } -static inline unsigned long p4d_page_vaddr(p4d_t p4d) +static inline pud_t *p4d_pgtable(p4d_t p4d) { - return p4d_val(p4d); + return (pud_t *)p4d_val(p4d); } #define p4d_phys(p4d) virt_to_phys((void *)p4d_val(p4d)) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 536037017c3e..5d34a8646f08 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1051,7 +1051,10 @@ extern struct page *p4d_page(p4d_t p4d); /* Pointers in the page table tree are physical addresses */ #define __pgtable_ptr_val(ptr) __pa(ptr) -#define p4d_page_vaddr(p4d) __va(p4d_val(p4d) & ~P4D_MASKED_BITS) +static inline pud_t *p4d_pgtable(p4d_t p4d) +{ + return (pud_t *)__va(p4d_val(p4d) & ~P4D_MASKED_BITS); +} static inline pmd_t *pud_pgtable(pud_t pud) { diff --git a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h index fe2f4c9acd9e..10f5cf444d72 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable-4k.h @@ -56,10 +56,14 @@ #define p4d_none(p4d) (!p4d_val(p4d)) #define p4d_bad(p4d) (p4d_val(p4d) == 0) #define p4d_present(p4d) (p4d_val(p4d) != 0) -#define p4d_page_vaddr(p4d) (p4d_val(p4d) & ~P4D_MASKED_BITS) #ifndef __ASSEMBLY__ +static inline pud_t *p4d_pgtable(p4d_t p4d) +{ + return (pud_t *) (p4d_val(p4d) & ~P4D_MASKED_BITS); +} + static inline void p4d_clear(p4d_t *p4dp) { *p4dp = __p4d(0); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 54f135b1ee60..e50ddf129c15 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -854,7 +854,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end) continue; } - pud_base = (pud_t *)p4d_page_vaddr(*p4d); + pud_base = p4d_pgtable(*p4d); remove_pud_table(pud_base, addr, next); free_pud_table(pud_base, p4d); } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 4ba311808bdb..78c8cf01db5f 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -105,7 +105,7 @@ struct page *p4d_page(p4d_t p4d) VM_WARN_ON(!p4d_huge(p4d)); return pte_page(p4d_pte(p4d)); } - return virt_to_page(p4d_page_vaddr(p4d)); + return virt_to_page(p4d_pgtable(p4d)); } #endif diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 23da25d4765c..4679e45c8348 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -856,8 +856,8 @@ static inline pmd_t *pud_pgtable(pud_t pud) #define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0UL) #define pud_present(pud) (pud_val(pud) != 0U) #define pud_clear(pudp) (pud_val(*(pudp)) = 0UL) -#define p4d_page_vaddr(p4d) \ - ((unsigned long) __va(p4d_val(p4d))) +#define p4d_pgtable(p4d) \ + ((pud_t *) __va(p4d_val(p4d))) #define p4d_present(p4d) (p4d_val(p4d) != 0U) #define p4d_clear(p4dp) (p4d_val(*(p4dp)) = 0UL) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e064c5fef0f3..448cd01eb3ec 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -877,9 +877,9 @@ static inline int p4d_present(p4d_t p4d) return p4d_flags(p4d) & _PAGE_PRESENT; } -static inline unsigned long p4d_page_vaddr(p4d_t p4d) +static inline pud_t *p4d_pgtable(p4d_t p4d) { - return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); + return (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 65ea58527176..ddeaba947eb3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -194,8 +194,8 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end) spin_lock(pgt_lock); if (!p4d_none(*p4d_ref) && !p4d_none(*p4d)) - BUG_ON(p4d_page_vaddr(*p4d) - != p4d_page_vaddr(*p4d_ref)); + BUG_ON(p4d_pgtable(*p4d) + != p4d_pgtable(*p4d_ref)); if (p4d_none(*p4d)) set_p4d(p4d, *p4d_ref); diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index 2f6b1befb129..03b7dae47dd4 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -41,7 +41,7 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) #define __p4d(x) ((p4d_t) { __pgd(x) }) #define pgd_page(pgd) (p4d_page((p4d_t){ pgd })) -#define pgd_page_vaddr(pgd) (p4d_page_vaddr((p4d_t){ pgd })) +#define pgd_page_vaddr(pgd) ((unsigned long)(p4d_pgtable((p4d_t){ pgd }))) /* * allocating and freeing a p4d is trivial: the 1-entry p4d is diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index 7cbd15f70bf5..eb70c6d7ceff 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -49,7 +49,7 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) #define __pud(x) ((pud_t) { __p4d(x) }) #define p4d_page(p4d) (pud_page((pud_t){ p4d })) -#define p4d_page_vaddr(p4d) (pud_pgtable((pud_t){ p4d })) +#define p4d_pgtable(p4d) ((pud_t *)(pud_pgtable((pud_t){ p4d }))) /* * allocating and freeing a pud is trivial: the 1-entry pud is diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c7c992ada1fe..d147480cdefc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -114,7 +114,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { - return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address); + return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif -- cgit v1.2.3 From f263a81451c12da5a342d90572e317e611846f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 7 Jul 2021 15:38:47 -0700 Subject: bpf: Track subprog poke descriptors correctly and fix use-after-free Subprograms are calling map_poke_track(), but on program release there is no hook to call map_poke_untrack(). However, on program release, the aux memory (and poke descriptor table) is freed even though we still have a reference to it in the element list of the map aux data. When we run map_poke_run(), we then end up accessing free'd memory, triggering KASAN in prog_array_map_poke_run(): [...] [ 402.824689] BUG: KASAN: use-after-free in prog_array_map_poke_run+0xc2/0x34e [ 402.824698] Read of size 4 at addr ffff8881905a7940 by task hubble-fgs/4337 [ 402.824705] CPU: 1 PID: 4337 Comm: hubble-fgs Tainted: G I 5.12.0+ #399 [ 402.824715] Call Trace: [ 402.824719] dump_stack+0x93/0xc2 [ 402.824727] print_address_description.constprop.0+0x1a/0x140 [ 402.824736] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824740] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824744] kasan_report.cold+0x7c/0xd8 [ 402.824752] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824757] prog_array_map_poke_run+0xc2/0x34e [ 402.824765] bpf_fd_array_map_update_elem+0x124/0x1a0 [...] The elements concerned are walked as follows: for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; [...] The access to size_poke_tab is a 4 byte read, verified by checking offsets in the KASAN dump: [ 402.825004] The buggy address belongs to the object at ffff8881905a7800 which belongs to the cache kmalloc-1k of size 1024 [ 402.825008] The buggy address is located 320 bytes inside of 1024-byte region [ffff8881905a7800, ffff8881905a7c00) The pahole output of bpf_prog_aux: struct bpf_prog_aux { [...] /* --- cacheline 5 boundary (320 bytes) --- */ u32 size_poke_tab; /* 320 4 */ [...] In general, subprograms do not necessarily manage their own data structures. For example, BTF func_info and linfo are just pointers to the main program structure. This allows reference counting and cleanup to be done on the latter which simplifies their management a bit. The aux->poke_tab struct, however, did not follow this logic. The initial proposed fix for this use-after-free bug further embedded poke data tracking into the subprogram with proper reference counting. However, Daniel and Alexei questioned why we were treating these objects special; I agree, its unnecessary. The fix here removes the per subprogram poke table allocation and map tracking and instead simply points the aux->poke_tab pointer at the main programs poke table. This way, map tracking is simplified to the main program and we do not need to manage them per subprogram. This also means, bpf_prog_free_deferred(), which unwinds the program reference counting and kfrees objects, needs to ensure that we don't try to double free the poke_tab when free'ing the subprog structures. This is easily solved by NULL'ing the poke_tab pointer. The second detail is to ensure that per subprogram JIT logic only does fixups on poke_tab[] entries it owns. To do this, we add a pointer in the poke structure to point at the subprogram value so JITs can easily check while walking the poke_tab structure if the current entry belongs to the current program. The aux pointer is stable and therefore suitable for such comparison. On the jit_subprogs() error path, we omit cleaning up the poke->aux field because these are only ever referenced from the JIT side, but on error we will never make it to the JIT, so its fine to leave them dangling. Removing these pointers would complicate the error path for no reason. However, we do need to untrack all poke descriptors from the main program as otherwise they could race with the freeing of JIT memory from the subprograms. Lastly, a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") had an off-by-one on the subprogram instruction index range check as it was testing 'insn_idx >= subprog_start && insn_idx <= subprog_end'. However, subprog_end is the next subprogram's start instruction. Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210707223848.14580-2-john.fastabend@gmail.com --- arch/x86/net/bpf_jit_comp.c | 3 +++ include/linux/bpf.h | 1 + kernel/bpf/core.c | 8 +++++- kernel/bpf/verifier.c | 60 ++++++++++++++++----------------------------- 4 files changed, 32 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e835164189f1..4b951458c9fc 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -570,6 +570,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) for (i = 0; i < prog->aux->size_poke_tab; i++) { poke = &prog->aux->poke_tab[i]; + if (poke->aux && poke->aux != prog->aux) + continue; + WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable)); if (poke->reason != BPF_POKE_REASON_TAIL_CALL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f309fc1509f2..e8e2b0393ca9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -780,6 +780,7 @@ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; + void *aux; union { struct { struct bpf_map *map; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 034ad93a1ad7..9b1577498373 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2236,8 +2236,14 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); - for (i = 0; i < aux->func_cnt; i++) + for (i = 0; i < aux->func_cnt; i++) { + /* We can just unlink the subprog poke descriptor table as + * it was originally linked to the main program and is also + * released along with it. + */ + aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); + } if (aux->func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index be38bb930bf1..42a4063de7cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12121,33 +12121,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) goto out_free; func[i]->is_func = 1; func[i]->aux->func_idx = i; - /* the btf and func_info will be freed only at prog->aux */ + /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + func[i]->aux->poke_tab = prog->aux->poke_tab; + func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; for (j = 0; j < prog->aux->size_poke_tab; j++) { - u32 insn_idx = prog->aux->poke_tab[j].insn_idx; - int ret; + struct bpf_jit_poke_descriptor *poke; - if (!(insn_idx >= subprog_start && - insn_idx <= subprog_end)) - continue; - - ret = bpf_jit_add_poke_descriptor(func[i], - &prog->aux->poke_tab[j]); - if (ret < 0) { - verbose(env, "adding tail call poke descriptor failed\n"); - goto out_free; - } - - func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; - - map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; - ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); - if (ret < 0) { - verbose(env, "tracking tail call prog failed\n"); - goto out_free; - } + poke = &prog->aux->poke_tab[j]; + if (poke->insn_idx < subprog_end && + poke->insn_idx >= subprog_start) + poke->aux = func[i]->aux; } /* Use bpf_prog_F_tag to indicate functions in stack traces. @@ -12178,18 +12164,6 @@ static int jit_subprogs(struct bpf_verifier_env *env) cond_resched(); } - /* Untrack main program's aux structs so that during map_poke_run() - * we will not stumble upon the unfilled poke descriptors; each - * of the main program's poke descs got distributed across subprogs - * and got tracked onto map, so we are sure that none of them will - * be missed after the operation below - */ - for (i = 0; i < prog->aux->size_poke_tab; i++) { - map_ptr = prog->aux->poke_tab[i].tail_call.map; - - map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); - } - /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -12267,14 +12241,22 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_jit_attempt_done(prog); return 0; out_free: + /* We failed JIT'ing, so at this point we need to unregister poke + * descriptors from subprogs, so that kernel is not attempting to + * patch it anymore as we're freeing the subprog JIT memory. + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* At this point we're guaranteed that poke descriptors are not + * live anymore. We can just unlink its descriptor table as it's + * released with the main prog. + */ for (i = 0; i < env->subprog_cnt; i++) { if (!func[i]) continue; - - for (j = 0; j < func[i]->aux->size_poke_tab; j++) { - map_ptr = func[i]->aux->poke_tab[j].tail_call.map; - map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); - } + func[i]->aux->poke_tab = NULL; bpf_jit_free(func[i]); } kfree(func); -- cgit v1.2.3 From e48a12e546ecbfb0718176037eae0ad60598a29a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 13 Jul 2021 06:16:05 +0200 Subject: jump_labels: Mark __jump_label_transform() as __always_inlined to work around aggressive compiler un-inlining In randconfig testing, certain UBSAN and CC Kconfig combinations with GCC 10.3.0: CONFIG_X86_32=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_UBSAN=y # CONFIG_UBSAN_TRAP is not set # CONFIG_UBSAN_BOUNDS is not set CONFIG_UBSAN_SHIFT=y # CONFIG_UBSAN_DIV_ZERO is not set CONFIG_UBSAN_UNREACHABLE=y CONFIG_UBSAN_BOOL=y # CONFIG_UBSAN_ENUM is not set # CONFIG_UBSAN_ALIGNMENT is not set # CONFIG_UBSAN_SANITIZE_ALL is not set ... produce this build warning (and build error if CONFIG_SECTION_MISMATCH_WARN_ONLY=y is set): WARNING: modpost: vmlinux.o(.text+0x4c1cc): Section mismatch in reference from the function __jump_label_transform() to the function .init.text:text_poke_early() The function __jump_label_transform() references the function __init text_poke_early(). This is often because __jump_label_transform lacks a __init annotation or the annotation of text_poke_early is wrong. ERROR: modpost: Section mismatches detected. The problem is that __jump_label_transform() gets uninlined by GCC, despite there being only a single local scope user of the 'static inline' function. Mark the function __always_inline instead, to work around this compiler bug/artifact. Signed-off-by: Ingo Molnar --- arch/x86/kernel/jump_label.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 674906fad43b..68f091ba8443 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -79,9 +79,10 @@ __jump_label_patch(struct jump_entry *entry, enum jump_label_type type) return (struct jump_label_patch){.code = code, .size = size}; } -static inline void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static __always_inline void +__jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) { const struct jump_label_patch jlp = __jump_label_patch(entry, type); -- cgit v1.2.3 From f0414b078dd11641a7a64027c2741396f47718fd Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Jun 2021 17:18:53 -0700 Subject: Revert "KVM: x86: WARN and reject loading KVM if NX is supported but not enabled" Let KVM load if EFER.NX=0 even if NX is supported, the analysis and testing (or lack thereof) for the non-PAE host case was garbage. If the kernel won't be using PAE paging, .Ldefault_entry in head_32.S skips over the entire EFER sequence. Hopefully that can be changed in the future to allow KVM to require EFER.NX, but the motivation behind KVM's requirement isn't yet merged. Reverting and revisiting the mess at a later date is by far the safest approach. This reverts commit 8bbed95d2cb6e5de8a342d761a89b0a04faed7be. Fixes: 8bbed95d2cb6 ("KVM: x86: WARN and reject loading KVM if NX is supported but not enabled") Signed-off-by: Sean Christopherson Message-Id: <20210625001853.318148-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 17468d983fbd..cf10d196ca65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10981,9 +10981,6 @@ int kvm_arch_hardware_setup(void *opaque) int r; rdmsrl_safe(MSR_EFER, &host_efer); - if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) && - !(host_efer & EFER_NX))) - return -EIO; if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); -- cgit v1.2.3 From 4bf48e3c0aafd32b960d341c4925b48f416f14a5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jun 2021 16:05:46 -0700 Subject: KVM: x86: Use guest MAXPHYADDR from CPUID.0x8000_0008 iff TDP is enabled Ignore the guest MAXPHYADDR reported by CPUID.0x8000_0008 if TDP, i.e. NPT, is disabled, and instead use the host's MAXPHYADDR. Per AMD'S APM: Maximum guest physical address size in bits. This number applies only to guests using nested paging. When this field is zero, refer to the PhysAddrSize field for the maximum guest physical address size. Fixes: 24c82e576b78 ("KVM: Sanitize cpuid") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210623230552.4027702-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c42613cfb5ba..1a4217b3e185 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -940,8 +940,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); unsigned phys_as = entry->eax & 0xff; - if (!g_phys_as) + /* + * Use bare metal's MAXPHADDR if the CPU doesn't report guest + * MAXPHYADDR separately, or if TDP (NPT) is disabled, as the + * guest version "applies only to guests using nested paging". + */ + if (!g_phys_as || !tdp_enabled) g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); -- cgit v1.2.3 From e39f00f60ebd2e7b295c37a05e6349df656d3eb8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jun 2021 16:05:47 -0700 Subject: KVM: x86: Use kernel's x86_phys_bits to handle reduced MAXPHYADDR Use boot_cpu_data.x86_phys_bits instead of the raw CPUID information to enumerate the MAXPHYADDR for KVM guests when TDP is disabled (the guest version is only relevant to NPT/TDP). When using shadow paging, any reductions to the host's MAXPHYADDR apply to KVM and its guests as well, i.e. using the raw CPUID info will cause KVM to misreport the number of PA bits available to the guest. Unconditionally zero out the "Physical Address bit reduction" entry. For !TDP, the adjustment is already done, and for TDP enumerating the host's reduction is wrong as the reduction does not apply to GPAs. Fixes: 9af9b94068fb ("x86/cpu/AMD: Handle SME reduction in physical address size") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210623230552.4027702-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1a4217b3e185..ca7866d63e98 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -941,11 +941,18 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) unsigned phys_as = entry->eax & 0xff; /* - * Use bare metal's MAXPHADDR if the CPU doesn't report guest - * MAXPHYADDR separately, or if TDP (NPT) is disabled, as the - * guest version "applies only to guests using nested paging". + * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as + * the guest operates in the same PA space as the host, i.e. + * reductions in MAXPHYADDR for memory encryption affect shadow + * paging, too. + * + * If TDP is enabled but an explicit guest MAXPHYADDR is not + * provided, use the raw bare metal MAXPHYADDR as reductions to + * the HPAs do not affect GPAs. */ - if (!g_phys_as || !tdp_enabled) + if (!tdp_enabled) + g_phys_as = boot_cpu_data.x86_phys_bits; + else if (!g_phys_as) g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); @@ -970,12 +977,18 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 0x8000001a: case 0x8000001e: break; - /* Support memory encryption cpuid if host supports it */ case 0x8000001F: - if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) + if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; - else + } else { cpuid_entry_override(entry, CPUID_8000_001F_EAX); + + /* + * Enumerate '0' for "PA bits reduction", the adjusted + * MAXPHYADDR is enumerated directly (see 0x80000008). + */ + entry->ebx &= ~GENMASK(11, 6); + } break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: -- cgit v1.2.3 From fc9bf2e087efcd81bda2e52d09616d2a1bf982a8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 23 Jun 2021 16:05:49 -0700 Subject: KVM: x86/mmu: Do not apply HPA (memory encryption) mask to GPAs Ignore "dynamic" host adjustments to the physical address mask when generating the masks for guest PTEs, i.e. the guest PA masks. The host physical address space and guest physical address space are two different beasts, e.g. even though SEV's C-bit is the same bit location for both host and guest, disabling SME in the host (which clears shadow_me_mask) does not affect the guest PTE->GPA "translation". For non-SEV guests, not dropping bits is the correct behavior. Assuming KVM and userspace correctly enumerate/configure guest MAXPHYADDR, bits that are lost as collateral damage from memory encryption are treated as reserved bits, i.e. KVM will never get to the point where it attempts to generate a gfn using the affected bits. And if userspace wants to create a bogus vCPU, then userspace gets to deal with the fallout of hardware doing odd things with bad GPAs. For SEV guests, not dropping the C-bit is technically wrong, but it's a moot point because KVM can't read SEV guest's page tables in any case since they're always encrypted. Not to mention that the current KVM code is also broken since sme_me_mask does not have to be non-zero for SEV to be supported by KVM. The proper fix would be to teach all of KVM to correctly handle guest private memory, but that's a task for the future. Fixes: d0ec49d4de90 ("kvm/x86/svm: Support Secure Memory Encryption within KVM") Cc: stable@vger.kernel.org Cc: Brijesh Singh Cc: Tom Lendacky Signed-off-by: Sean Christopherson Message-Id: <20210623230552.4027702-5-seanjc@google.com> [Use a new header instead of adding header guards to paging_tmpl.h. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 ++ arch/x86/kvm/mmu/paging.h | 14 ++++++++++++++ arch/x86/kvm/mmu/paging_tmpl.h | 4 ++-- arch/x86/kvm/mmu/spte.h | 6 ------ 4 files changed, 18 insertions(+), 8 deletions(-) create mode 100644 arch/x86/kvm/mmu/paging.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 00732757cc60..b888385d1933 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -53,6 +53,8 @@ #include #include "trace.h" +#include "paging.h" + extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h new file mode 100644 index 000000000000..de8ab323bb70 --- /dev/null +++ b/arch/x86/kvm/mmu/paging.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Shadow paging constants/helpers that don't need to be #undef'd. */ +#ifndef __KVM_X86_PAGING_H +#define __KVM_X86_PAGING_H + +#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_LVL_ADDR_MASK(level) \ + (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#endif /* __KVM_X86_PAGING_H */ + diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 490a028ddabe..ee044d357b5f 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -24,7 +24,7 @@ #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -57,7 +57,7 @@ #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 7a5ce9314107..eb7b227fc6cf 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -38,12 +38,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #else #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) -- cgit v1.2.3 From 76ff371b67cb12fb635396234468abcf6a466f16 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 24 Jun 2021 19:03:54 -0700 Subject: KVM: SVM: Revert clearing of C-bit on GPA in #NPF handler Don't clear the C-bit in the #NPF handler, as it is a legal GPA bit for non-SEV guests, and for SEV guests the C-bit is dropped before the GPA hits the NPT in hardware. Clearing the bit for non-SEV guests causes KVM to mishandle #NPFs with that collide with the host's C-bit. Although the APM doesn't explicitly state that the C-bit is not reserved for non-SEV, Tom Lendacky confirmed that the following snippet about the effective reduction due to the C-bit does indeed apply only to SEV guests. Note that because guest physical addresses are always translated through the nested page tables, the size of the guest physical address space is not impacted by any physical address space reduction indicated in CPUID 8000_001F[EBX]. If the C-bit is a physical address bit however, the guest physical address space is effectively reduced by 1 bit. And for SEV guests, the APM clearly states that the bit is dropped before walking the nested page tables. If the C-bit is an address bit, this bit is masked from the guest physical address when it is translated through the nested page tables. Consequently, the hypervisor does not need to be aware of which pages the guest has chosen to mark private. Note, the bogus C-bit clearing was removed from legacy #PF handler in commit 6d1b867d0456 ("KVM: SVM: Don't strip the C-bit from CR2 on #PF interception"). Fixes: 0ede79e13224 ("KVM: SVM: Clear C-bit from the page fault address") Cc: Peter Gonda Cc: Brijesh Singh Cc: Tom Lendacky Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210625020354.431829-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8834822c00cd..ca5614a48b21 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1923,7 +1923,7 @@ static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); + u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); -- cgit v1.2.3 From 7234c362ccb3c2228f06f19f93b132de9cfa7ae4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 28 Jun 2021 15:43:54 +0800 Subject: KVM: x86/pmu: Clear anythread deprecated bit when 0xa leaf is unsupported on the SVM The AMD platform does not support the functions Ah CPUID leaf. The returned results for this entry should all remain zero just like the native does: AMD host: 0x0000000a 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00000000 (uncanny) AMD guest: 0x0000000a 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000000 edx=0x00008000 Fixes: cadbaa039b99 ("perf/x86/intel: Make anythread filter support conditional") Signed-off-by: Like Xu Message-Id: <20210628074354.33848-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ca7866d63e98..739be5da3bca 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -765,7 +765,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS); edx.split.bit_width_fixed = cap.bit_width_fixed; - edx.split.anythread_deprecated = 1; + if (cap.version) + edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; -- cgit v1.2.3 From f85d40160691881a17a397c448d799dfc90987ba Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 29 Jun 2021 01:26:32 +0800 Subject: KVM: X86: Disable hardware breakpoints unconditionally before kvm_x86->run() When the host is using debug registers but the guest is not using them nor is the guest in guest-debug state, the kvm code does not reset the host debug registers before kvm_x86->run(). Rather, it relies on the hardware vmentry instruction to automatically reset the dr7 registers which ensures that the host breakpoints do not affect the guest. This however violates the non-instrumentable nature around VM entry and exit; for example, when a host breakpoint is set on vcpu->arch.cr2, Another issue is consistency. When the guest debug registers are active, the host breakpoints are reset before kvm_x86->run(). But when the guest debug registers are inactive, the host breakpoints are delayed to be disabled. The host tracing tools may see different results depending on what the guest is doing. To fix the problems, we clear %db7 unconditionally before kvm_x86->run() if the host has set any breakpoints, no matter if the guest is using them or not. Signed-off-by: Lai Jiangshan Message-Id: <20210628172632.81029-1-jiangshanlai@gmail.com> Cc: stable@vger.kernel.org [Only clear %db7 instead of reloading all debug registers. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf10d196ca65..c471b1375f36 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9597,6 +9597,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[3], 3); set_debugreg(vcpu->arch.dr6, 6); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + } else if (unlikely(hw_breakpoint_active())) { + set_debugreg(0, 7); } for (;;) { -- cgit v1.2.3 From c0e1303ed4cc9e7ce39f106b471ad92ca559e3d3 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Thu, 8 Jul 2021 07:57:02 +0800 Subject: KVM: VMX: Remove vmx_msr_index from vmx.h vmx_msr_index was used to record the list of MSRs which can be lazily restored when kvm returns to userspace. It is now reimplemented as kvm_uret_msrs_list, a common x86 list which is only used inside x86.c. So just remove the obsolete declaration in vmx.h. Signed-off-by: Yu Zhang Message-Id: <20210707235702.31595-1-yu.c.zhang@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 3979a947933a..db88ed4f2121 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -14,8 +14,6 @@ #include "vmx_ops.h" #include "cpuid.h" -extern const u32 vmx_msr_index[]; - #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 #define MSR_TYPE_RW 3 -- cgit v1.2.3 From 991afbbee8ac93b055a27477278a5fb556af1ff4 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 7 Jul 2021 15:50:58 +0300 Subject: KVM: SVM: #SMI interception must not skip the instruction Commit 5ff3a351f687 ("KVM: x86: Move trivial instruction-based exit handlers to common code"), unfortunately made a mistake of treating nop_on_interception and nop_interception in the same way. Former does truly nothing while the latter skips the instruction. SMI VM exit handler should do nothing. (SMI itself is handled by the host when we do STGI) Fixes: 5ff3a351f687 ("KVM: x86: Move trivial instruction-based exit handlers to common code") Signed-off-by: Maxim Levitsky Message-Id: <20210707125100.677203-2-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ca5614a48b21..80790faf6370 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2106,6 +2106,11 @@ static int nmi_interception(struct kvm_vcpu *vcpu) return 1; } +static int smi_interception(struct kvm_vcpu *vcpu) +{ + return 1; +} + static int intr_interception(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; @@ -3080,7 +3085,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, - [SVM_EXIT_SMI] = kvm_emulate_as_nop, + [SVM_EXIT_SMI] = smi_interception, [SVM_EXIT_INIT] = kvm_emulate_as_nop, [SVM_EXIT_VINTR] = interrupt_window_interception, [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, -- cgit v1.2.3 From 896707c212d440a6863ce0a3930c8a609e24497d Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 7 Jul 2021 15:50:59 +0300 Subject: KVM: SVM: remove INIT intercept handler Kernel never sends real INIT even to CPUs, other than on boot. Thus INIT interception is an error which should be caught by a check for an unknown VMexit reason. On top of that, the current INIT VM exit handler skips the current instruction which is wrong. That was added in commit 5ff3a351f687 ("KVM: x86: Move trivial instruction-based exit handlers to common code"). Fixes: 5ff3a351f687 ("KVM: x86: Move trivial instruction-based exit handlers to common code") Signed-off-by: Maxim Levitsky Message-Id: <20210707125100.677203-3-mlevitsk@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 80790faf6370..18aa4d45cc64 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3086,7 +3086,6 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = smi_interception, - [SVM_EXIT_INIT] = kvm_emulate_as_nop, [SVM_EXIT_VINTR] = interrupt_window_interception, [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, [SVM_EXIT_CPUID] = kvm_emulate_cpuid, -- cgit v1.2.3 From 4b639a9f82fcf15497d1613a29aa1df798a24029 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Wed, 7 Jul 2021 15:51:00 +0300 Subject: KVM: SVM: add module param to control the #SMI interception In theory there are no side effects of not intercepting #SMI, because then #SMI becomes transparent to the OS and the KVM. Plus an observation on recent Zen2 CPUs reveals that these CPUs ignore #SMI interception and never deliver #SMI VMexits. This is also useful to test nested KVM to see that L1 handles #SMIs correctly in case when L1 doesn't intercept #SMI. Finally the default remains the same, the SMI are intercepted by default thus this patch doesn't have any effect unless non default module param value is used. Signed-off-by: Maxim Levitsky Message-Id: <20210707125100.677203-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 4 ++++ arch/x86/kvm/svm/svm.c | 10 +++++++++- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 21d03e3a5dfd..2884c54a72bb 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -154,6 +154,10 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] |= g->intercepts[i]; + + /* If SMI is not intercepted, ignore guest SMI intercept as well */ + if (!intercept_smi) + vmcb_clr_intercept(c, INTERCEPT_SMI); } static void copy_vmcb_control_area(struct vmcb_control_area *dst, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 18aa4d45cc64..63488d3beb5b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -198,6 +198,11 @@ module_param(avic, bool, 0444); bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); + +bool intercept_smi = true; +module_param(intercept_smi, bool, 0444); + + static bool svm_gp_erratum_intercept = true; static u8 rsm_ins_bytes[] = "\x0f\xaa"; @@ -1185,7 +1190,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_INTR); svm_set_intercept(svm, INTERCEPT_NMI); - svm_set_intercept(svm, INTERCEPT_SMI); + + if (intercept_smi) + svm_set_intercept(svm, INTERCEPT_SMI); + svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0); svm_set_intercept(svm, INTERCEPT_RDPMC); svm_set_intercept(svm, INTERCEPT_CPUID); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f89b623bb591..8cb3bd59c5ea 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -31,6 +31,7 @@ #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +extern bool intercept_smi; /* * Clean bits in VMCB. -- cgit v1.2.3 From b4a693924aab93f3747465b2261add46c82c3220 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 May 2021 10:58:25 -0700 Subject: KVM: SVM: Return -EFAULT if copy_to_user() for SEV mig packet header fails Return -EFAULT if copy_to_user() fails; if accessing user memory faults, copy_to_user() returns the number of bytes remaining, not an error code. Reported-by: Dan Carpenter Cc: Steve Rutherford Cc: Brijesh Singh Cc: Ashish Kalra Fixes: d3d1af85e2c7 ("KVM: SVM: Add KVM_SEND_UPDATE_DATA command") Signed-off-by: Sean Christopherson Message-Id: <20210506175826.2166383-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8d36f0c73071..3dc3e2897804 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1309,8 +1309,9 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) } /* Copy packet header to userspace. */ - ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, - params.hdr_len); + if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, + params.hdr_len)) + ret = -EFAULT; e_free_trans_data: kfree(trans_data); -- cgit v1.2.3 From c7a1b2b678c54ac19320daf525038d0e2e43ca7c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 May 2021 10:58:26 -0700 Subject: KVM: SVM: Fix sev_pin_memory() error checks in SEV migration utilities Use IS_ERR() instead of checking for a NULL pointer when querying for sev_pin_memory() failures. sev_pin_memory() always returns an error code cast to a pointer, or a valid pointer; it never returns NULL. Reported-by: Dan Carpenter Cc: Steve Rutherford Cc: Brijesh Singh Cc: Ashish Kalra Fixes: d3d1af85e2c7 ("KVM: SVM: Add KVM_SEND_UPDATE_DATA command") Fixes: 15fb7de1a7f5 ("KVM: SVM: Add KVM_SEV_RECEIVE_UPDATE_DATA command") Signed-off-by: Sean Christopherson Message-Id: <20210506175826.2166383-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 3dc3e2897804..02d60d7f903d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1271,8 +1271,8 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, PAGE_SIZE, &n, 0); - if (!guest_page) - return -EFAULT; + if (IS_ERR(guest_page)) + return PTR_ERR(guest_page); /* allocate memory for header and transport buffer */ ret = -ENOMEM; @@ -1463,11 +1463,12 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.trans_len = params.trans_len; /* Pin guest memory */ - ret = -EFAULT; guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, PAGE_SIZE, &n, 0); - if (!guest_page) + if (IS_ERR(guest_page)) { + ret = PTR_ERR(guest_page); goto e_free_trans; + } /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; -- cgit v1.2.3 From fce7e152ffc8f89d02a80617b16c7aa1527847c8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 28 Jun 2021 12:44:20 +0200 Subject: KVM: nSVM: Check the value written to MSR_VM_HSAVE_PA APM states that #GP is raised upon write to MSR_VM_HSAVE_PA when the supplied address is not page-aligned or is outside of "maximum supported physical address for this implementation". page_address_valid() check seems suitable. Also, forcefully page-align the address when it's written from VMM. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210628104425.391276-2-vkuznets@redhat.com> Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky [Add comment about behavior for host-provided values. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 63488d3beb5b..b6e91de891cb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2954,7 +2954,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm_disable_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: - svm->nested.hsave_msr = data; + /* + * Old kernels did not validate the value written to + * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid + * value to allow live migrating buggy or malicious guests + * originating from those kernels. + */ + if (!msr->host_initiated && !page_address_valid(vcpu, data)) + return 1; + + svm->nested.hsave_msr = data & PAGE_MASK; break; case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); -- cgit v1.2.3 From fb79f566e4c99db8647cf0435e3732f12e856ab0 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 28 Jun 2021 12:44:21 +0200 Subject: KVM: nSVM: Check that VM_HSAVE_PA MSR was set before VMRUN APM states that "The address written to the VM_HSAVE_PA MSR, which holds the address of the page used to save the host state on a VMRUN, must point to a hypervisor-owned page. If this check fails, the WRMSR will fail with a #GP(0) exception. Note that a value of 0 is not considered valid for the VM_HSAVE_PA MSR and a VMRUN that is attempted while the HSAVE_PA is 0 will fail with a #GP(0) exception." svm_set_msr() already checks that the supplied address is valid, so only check for '0' is missing. Add it to nested_svm_vmrun(). Signed-off-by: Vitaly Kuznetsov Message-Id: <20210628104425.391276-3-vkuznets@redhat.com> Reviewed-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 2884c54a72bb..ec16a06f9aa8 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -622,6 +622,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) struct kvm_host_map map; u64 vmcb12_gpa; + if (!svm->nested.hsave_msr) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (is_smm(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; -- cgit v1.2.3 From 0a758290762cf6fb69ad09712ac834cd4f07504f Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 28 Jun 2021 12:44:22 +0200 Subject: KVM: nSVM: Introduce svm_copy_vmrun_state() Separate the code setting non-VMLOAD-VMSAVE state from svm_set_nested_state() into its own function. This is going to be re-used from svm_enter_smm()/svm_leave_smm(). Signed-off-by: Vitaly Kuznetsov Message-Id: <20210628104425.391276-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 40 ++++++++++++++++++++++------------------ arch/x86/kvm/svm/svm.h | 2 ++ 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index ec16a06f9aa8..c4296fb4b8be 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -701,6 +701,27 @@ out: return ret; } +/* Copy state save area fields which are handled by VMRUN */ +void svm_copy_vmrun_state(struct vmcb_save_area *from_save, + struct vmcb_save_area *to_save) +{ + to_save->es = from_save->es; + to_save->cs = from_save->cs; + to_save->ss = from_save->ss; + to_save->ds = from_save->ds; + to_save->gdtr = from_save->gdtr; + to_save->idtr = from_save->idtr; + to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED; + to_save->efer = from_save->efer; + to_save->cr0 = from_save->cr0; + to_save->cr3 = from_save->cr3; + to_save->cr4 = from_save->cr4; + to_save->rax = from_save->rax; + to_save->rsp = from_save->rsp; + to_save->rip = from_save->rip; + to_save->cpl = 0; +} + void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; @@ -1364,28 +1385,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; - svm->vmcb01.ptr->save.es = save->es; - svm->vmcb01.ptr->save.cs = save->cs; - svm->vmcb01.ptr->save.ss = save->ss; - svm->vmcb01.ptr->save.ds = save->ds; - svm->vmcb01.ptr->save.gdtr = save->gdtr; - svm->vmcb01.ptr->save.idtr = save->idtr; - svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED; - svm->vmcb01.ptr->save.efer = save->efer; - svm->vmcb01.ptr->save.cr0 = save->cr0; - svm->vmcb01.ptr->save.cr3 = save->cr3; - svm->vmcb01.ptr->save.cr4 = save->cr4; - svm->vmcb01.ptr->save.rax = save->rax; - svm->vmcb01.ptr->save.rsp = save->rsp; - svm->vmcb01.ptr->save.rip = save->rip; - svm->vmcb01.ptr->save.cpl = 0; - + svm_copy_vmrun_state(save, &svm->vmcb01.ptr->save); nested_load_control_from_vmcb12(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); - kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 8cb3bd59c5ea..fe87fd68b73b 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -464,6 +464,8 @@ void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); +void svm_copy_vmrun_state(struct vmcb_save_area *from_save, + struct vmcb_save_area *to_save); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); -- cgit v1.2.3 From 37be407b2ce807179108eeac788805848fe048f1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 28 Jun 2021 12:44:23 +0200 Subject: KVM: nSVM: Fix L1 state corruption upon return from SMM VMCB split commit 4995a3685f1b ("KVM: SVM: Use a separate vmcb for the nested L2 guest") broke return from SMM when we entered there from guest (L2) mode. Gen2 WS2016/Hyper-V is known to do this on boot. The problem manifests itself like this: kvm_exit: reason EXIT_RSM rip 0x7ffbb280 info 0 0 kvm_emulate_insn: 0:7ffbb280: 0f aa kvm_smm_transition: vcpu 0: leaving SMM, smbase 0x7ffb3000 kvm_nested_vmrun: rip: 0x000000007ffbb280 vmcb: 0x0000000008224000 nrip: 0xffffffffffbbe119 int_ctl: 0x01020000 event_inj: 0x00000000 npt: on kvm_nested_intercepts: cr_read: 0000 cr_write: 0010 excp: 40060002 intercepts: fd44bfeb 0000217f 00000000 kvm_entry: vcpu 0, rip 0xffffffffffbbe119 kvm_exit: reason EXIT_NPF rip 0xffffffffffbbe119 info 200000006 1ab000 kvm_nested_vmexit: vcpu 0 reason npf rip 0xffffffffffbbe119 info1 0x0000000200000006 info2 0x00000000001ab000 intr_info 0x00000000 error_code 0x00000000 kvm_page_fault: address 1ab000 error_code 6 kvm_nested_vmexit_inject: reason EXIT_NPF info1 200000006 info2 1ab000 int_info 0 int_info_err 0 kvm_entry: vcpu 0, rip 0x7ffbb280 kvm_exit: reason EXIT_EXCP_GP rip 0x7ffbb280 info 0 0 kvm_emulate_insn: 0:7ffbb280: 0f aa kvm_inj_exception: #GP (0x0) Note: return to L2 succeeded but upon first exit to L1 its RIP points to 'RSM' instruction but we're not in SMM. The problem appears to be that VMCB01 gets irreversibly destroyed during SMM execution. Previously, we used to have 'hsave' VMCB where regular (pre-SMM) L1's state was saved upon nested_svm_vmexit() but now we just switch to VMCB01 from VMCB02. Pre-split (working) flow looked like: - SMM is triggered during L2's execution - L2's state is pushed to SMRAM - nested_svm_vmexit() restores L1's state from 'hsave' - SMM -> RSM - enter_svm_guest_mode() switches to L2 but keeps 'hsave' intact so we have pre-SMM (and pre L2 VMRUN) L1's state there - L2's state is restored from SMRAM - upon first exit L1's state is restored from L1. This was always broken with regards to svm_get_nested_state()/ svm_set_nested_state(): 'hsave' was never a part of what's being save and restored so migration happening during SMM triggered from L2 would never restore L1's state correctly. Post-split flow (broken) looks like: - SMM is triggered during L2's execution - L2's state is pushed to SMRAM - nested_svm_vmexit() switches to VMCB01 from VMCB02 - SMM -> RSM - enter_svm_guest_mode() switches from VMCB01 to VMCB02 but pre-SMM VMCB01 is already lost. - L2's state is restored from SMRAM - upon first exit L1's state is restored from VMCB01 but it is corrupted (reflects the state during 'RSM' execution). VMX doesn't have this problem because unlike VMCB, VMCS keeps both guest and host state so when we switch back to VMCS02 L1's state is intact there. To resolve the issue we need to save L1's state somewhere. We could've created a third VMCB for SMM but that would require us to modify saved state format. L1's architectural HSAVE area (pointed by MSR_VM_HSAVE_PA) seems appropriate: L0 is free to save any (or none) of L1's state there. Currently, KVM does 'none'. Note, for nested state migration to succeed, both source and destination hypervisors must have the fix. We, however, don't need to create a new flag indicating the fact that HSAVE area is now populated as migration during SMM triggered from L2 was always broken. Fixes: 4995a3685f1b ("KVM: SVM: Use a separate vmcb for the nested L2 guest") Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b6e91de891cb..cf8471890266 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4309,6 +4309,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_host_map map_save; int ret; if (is_guest_mode(vcpu)) { @@ -4324,6 +4325,29 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) ret = nested_svm_vmexit(svm); if (ret) return ret; + + /* + * KVM uses VMCB01 to store L1 host state while L2 runs but + * VMCB01 is going to be used during SMM and thus the state will + * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save + * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the + * format of the area is identical to guest save area offsetted + * by 0x400 (matches the offset of 'struct vmcb_save_area' + * within 'struct vmcb'). Note: HSAVE area may also be used by + * L1 hypervisor to save additional host context (e.g. KVM does + * that, see svm_prepare_guest_switch()) which must be + * preserved. + */ + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), + &map_save) == -EINVAL) + return 1; + + BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); + + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, + map_save.hva + 0x400); + + kvm_vcpu_unmap(vcpu, &map_save, true); } return 0; } @@ -4331,7 +4355,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_host_map map; + struct kvm_host_map map, map_save; int ret = 0; if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { @@ -4355,6 +4379,19 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva); kvm_vcpu_unmap(vcpu, &map, true); + + /* + * Restore L1 host state from L1 HSAVE area as VMCB01 was + * used during SMM (see svm_enter_smm()) + */ + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), + &map_save) == -EINVAL) + return 1; + + svm_copy_vmrun_state(map_save.hva + 0x400, + &svm->vmcb01.ptr->save); + + kvm_vcpu_unmap(vcpu, &map_save, true); } } -- cgit v1.2.3 From bb00bd9c0862558c6528e3ac97470aee222436ef Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 28 Jun 2021 12:44:24 +0200 Subject: KVM: nSVM: Restore nested control upon leaving SMM If the VM was migrated while in SMM, no nested state was saved/restored, and therefore svm_leave_smm has to load both save and control area of the vmcb12. Save area is already loaded from HSAVE area, so now load the control area as well from the vmcb12. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210628104425.391276-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/svm/svm.c | 7 ++++++- arch/x86/kvm/svm/svm.h | 2 ++ 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c4296fb4b8be..3bd09c50c98b 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -308,8 +308,8 @@ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, return true; } -static void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control) +void nested_load_control_from_vmcb12(struct vcpu_svm *svm, + struct vmcb_control_area *control) { copy_vmcb_control_area(&svm->nested.ctl, control); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cf8471890266..664d20f0689c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4362,6 +4362,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); + struct vmcb *vmcb12; if (guest) { if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) @@ -4377,7 +4378,11 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (svm_allocate_nested(svm)) return 1; - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva); + vmcb12 = map.hva; + + nested_load_control_from_vmcb12(svm, &vmcb12->control); + + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12); kvm_vcpu_unmap(vcpu, &map, true); /* diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index fe87fd68b73b..7e2090752d8f 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -482,6 +482,8 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); +void nested_load_control_from_vmcb12(struct vcpu_svm *svm, + struct vmcb_control_area *control); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); -- cgit v1.2.3 From d8a719059b9dc963aa190598778ac804ff3e6a87 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Wed, 21 Jul 2021 17:02:13 +1000 Subject: Revert "mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge" This reverts commit c742199a014de23ee92055c2473d91fe5561ffdf. c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge") breaks arm64 in at least two ways for configurations where PUD or PMD folding occur: 1. We no longer install huge-vmap mappings and silently fall back to page-granular entries, despite being able to install block entries at what is effectively the PGD level. 2. If the linear map is backed with block mappings, these will now silently fail to be created in alloc_init_pud(), causing a panic early during boot. The pgtable selftests caught this, although a fix has not been forthcoming and Christophe is AWOL at the moment, so just revert the change for now to get a working -rc3 on which we can queue patches for 5.15. A simple revert breaks the build for 32-bit PowerPC 8xx machines, which rely on the default function definitions when the corresponding page-table levels are folded, since commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge pages on VMAP and VMALLOC"), eg: powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pud_range': linux/mm/vmalloc.c:362: undefined reference to `pud_clear_huge' To avoid that, add stubs for pud_clear_huge() and pmd_clear_huge() in arch/powerpc/mm/nohash/8xx.c as suggested by Christophe. Cc: Christophe Leroy Cc: Catalin Marinas Cc: Andrew Morton Cc: Nicholas Piggin Cc: Mike Rapoport Cc: Mark Rutland Cc: Geert Uytterhoeven Fixes: c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge") Signed-off-by: Jonathan Marek Reviewed-by: Ard Biesheuvel Acked-by: Marc Zyngier [mpe: Fold in 8xx.c changes from Christophe and mention in change log] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/linux-arm-kernel/CAMuHMdXShORDox-xxaeUfDW3wx2PeggFSqhVSHVZNKCGK-y_vQ@mail.gmail.com/ Link: https://lore.kernel.org/r/20210717160118.9855-1-jonathan@marek.ca Link: https://lore.kernel.org/r/87r1fs1762.fsf@mpe.ellerman.id.au Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 20 ++++++++------------ arch/powerpc/mm/nohash/8xx.c | 10 ++++++++++ arch/x86/mm/pgtable.c | 34 +++++++++++++++------------------- include/linux/pgtable.h | 26 +------------------------- 4 files changed, 34 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d74586508448..9ff0de1b2b93 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1339,7 +1339,6 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) return dt_virt; } -#if CONFIG_PGTABLE_LEVELS > 3 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) { pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot)); @@ -1354,16 +1353,6 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) return 1; } -int pud_clear_huge(pud_t *pudp) -{ - if (!pud_sect(READ_ONCE(*pudp))) - return 0; - pud_clear(pudp); - return 1; -} -#endif - -#if CONFIG_PGTABLE_LEVELS > 2 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) { pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot)); @@ -1378,6 +1367,14 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) return 1; } +int pud_clear_huge(pud_t *pudp) +{ + if (!pud_sect(READ_ONCE(*pudp))) + return 0; + pud_clear(pudp); + return 1; +} + int pmd_clear_huge(pmd_t *pmdp) { if (!pmd_sect(READ_ONCE(*pmdp))) @@ -1385,7 +1382,6 @@ int pmd_clear_huge(pmd_t *pmdp) pmd_clear(pmdp); return 1; } -#endif int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) { diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 60780e089118..0df9fe29dd56 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -240,3 +240,13 @@ void __init setup_kuap(bool disabled) mtspr(SPRN_MD_AP, MD_APG_KUAP); } #endif + +int pud_clear_huge(pud_t *pud) +{ + return 0; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + return 0; +} diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3364fe62b903..3481b35cb4ec 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -682,7 +682,6 @@ int p4d_clear_huge(p4d_t *p4d) } #endif -#if CONFIG_PGTABLE_LEVELS > 3 /** * pud_set_huge - setup kernel PUD mapping * @@ -721,23 +720,6 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } -/** - * pud_clear_huge - clear kernel PUD mapping when it is set - * - * Returns 1 on success and 0 on failure (no PUD map is found). - */ -int pud_clear_huge(pud_t *pud) -{ - if (pud_large(*pud)) { - pud_clear(pud); - return 1; - } - - return 0; -} -#endif - -#if CONFIG_PGTABLE_LEVELS > 2 /** * pmd_set_huge - setup kernel PMD mapping * @@ -768,6 +750,21 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ +int pud_clear_huge(pud_t *pud) +{ + if (pud_large(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + /** * pmd_clear_huge - clear kernel PMD mapping when it is set * @@ -782,7 +779,6 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } -#endif #ifdef CONFIG_X86_64 /** diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d147480cdefc..e24d2c992b11 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1397,34 +1397,10 @@ static inline int p4d_clear_huge(p4d_t *p4d) } #endif /* !__PAGETABLE_P4D_FOLDED */ -#ifndef __PAGETABLE_PUD_FOLDED int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); -int pud_clear_huge(pud_t *pud); -#else -static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) -{ - return 0; -} -static inline int pud_clear_huge(pud_t *pud) -{ - return 0; -} -#endif /* !__PAGETABLE_PUD_FOLDED */ - -#ifndef __PAGETABLE_PMD_FOLDED int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); +int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); -#else -static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) -{ - return 0; -} -static inline int pmd_clear_huge(pmd_t *pmd) -{ - return 0; -} -#endif /* !__PAGETABLE_PMD_FOLDED */ - int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); -- cgit v1.2.3 From f5a11c69b69923a4367d24365ad4dff6d4f3fc42 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 21 Jul 2021 15:55:43 +0000 Subject: Revert "x86/hyperv: fix logical processor creation" This reverts commit 450605c28d571eddca39a65fdbc1338add44c6d9. Signed-off-by: Wei Liu --- arch/x86/kernel/cpu/mshyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index cc8f1773deca..c890d67a64ad 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -237,7 +237,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) for_each_present_cpu(i) { if (i == 0) continue; - ret = hv_call_add_logical_proc(numa_cpu_node(i), i, i); + ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i)); BUG_ON(ret); } -- cgit v1.2.3 From 9a9e74819bb0e4694279fb437e136fe485878d25 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 16 Jul 2021 16:41:04 +0200 Subject: KVM: nSVM: Rename nested_svm_vmloadsave() to svm_copy_vmloadsave_state() To match svm_copy_vmrun_state(), rename nested_svm_vmloadsave() to svm_copy_vmloadsave_state(). Opportunistically add missing braces to 'else' branch in vmload_vmsave_interception(). No functional change intended. Suggested-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Message-Id: <20210716144104.465269-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.c | 7 ++++--- arch/x86/kvm/svm/svm.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3bd09c50c98b..8493592b63b4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -722,7 +722,7 @@ void svm_copy_vmrun_state(struct vmcb_save_area *from_save, to_save->cpl = 0; } -void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +void svm_copy_vmloadsave_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; to_vmcb->save.gs = from_vmcb->save.gs; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 664d20f0689c..cfe165d74093 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2147,11 +2147,12 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) ret = kvm_skip_emulated_instruction(vcpu); if (vmload) { - nested_svm_vmloadsave(vmcb12, svm->vmcb); + svm_copy_vmloadsave_state(vmcb12, svm->vmcb); svm->sysenter_eip_hi = 0; svm->sysenter_esp_hi = 0; - } else - nested_svm_vmloadsave(svm->vmcb, vmcb12); + } else { + svm_copy_vmloadsave_state(svm->vmcb, vmcb12); + } kvm_vcpu_unmap(vcpu, &map, true); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7e2090752d8f..1b65ee3a9569 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -466,7 +466,7 @@ int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); void svm_copy_vmrun_state(struct vmcb_save_area *from_save, struct vmcb_save_area *to_save); -void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); +void svm_copy_vmloadsave_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) -- cgit v1.2.3 From 2bb16bea5feaa582fbbdbfd84ecaa1ab61bbb34c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 19 Jul 2021 11:03:22 +0200 Subject: KVM: nSVM: Swap the parameter order for svm_copy_vmrun_state()/svm_copy_vmloadsave_state() Make svm_copy_vmrun_state()/svm_copy_vmloadsave_state() interface match 'memcpy(dest, src)' to avoid any confusion. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Vitaly Kuznetsov Message-Id: <20210719090322.625277-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 8 ++++---- arch/x86/kvm/svm/svm.c | 12 ++++++------ arch/x86/kvm/svm/svm.h | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 8493592b63b4..1c2a0414a88d 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -702,8 +702,8 @@ out: } /* Copy state save area fields which are handled by VMRUN */ -void svm_copy_vmrun_state(struct vmcb_save_area *from_save, - struct vmcb_save_area *to_save) +void svm_copy_vmrun_state(struct vmcb_save_area *to_save, + struct vmcb_save_area *from_save) { to_save->es = from_save->es; to_save->cs = from_save->cs; @@ -722,7 +722,7 @@ void svm_copy_vmrun_state(struct vmcb_save_area *from_save, to_save->cpl = 0; } -void svm_copy_vmloadsave_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; to_vmcb->save.gs = from_vmcb->save.gs; @@ -1385,7 +1385,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; - svm_copy_vmrun_state(save, &svm->vmcb01.ptr->save); + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save); nested_load_control_from_vmcb12(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cfe165d74093..9a6987549e1b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2147,11 +2147,11 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) ret = kvm_skip_emulated_instruction(vcpu); if (vmload) { - svm_copy_vmloadsave_state(vmcb12, svm->vmcb); + svm_copy_vmloadsave_state(svm->vmcb, vmcb12); svm->sysenter_eip_hi = 0; svm->sysenter_esp_hi = 0; } else { - svm_copy_vmloadsave_state(svm->vmcb, vmcb12); + svm_copy_vmloadsave_state(vmcb12, svm->vmcb); } kvm_vcpu_unmap(vcpu, &map, true); @@ -4345,8 +4345,8 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); - svm_copy_vmrun_state(&svm->vmcb01.ptr->save, - map_save.hva + 0x400); + svm_copy_vmrun_state(map_save.hva + 0x400, + &svm->vmcb01.ptr->save); kvm_vcpu_unmap(vcpu, &map_save, true); } @@ -4394,8 +4394,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) &map_save) == -EINVAL) return 1; - svm_copy_vmrun_state(map_save.hva + 0x400, - &svm->vmcb01.ptr->save); + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, + map_save.hva + 0x400); kvm_vcpu_unmap(vcpu, &map_save, true); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 1b65ee3a9569..bd0fe94c2920 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -464,9 +464,9 @@ void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); -void svm_copy_vmrun_state(struct vmcb_save_area *from_save, - struct vmcb_save_area *to_save); -void svm_copy_vmloadsave_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb); +void svm_copy_vmrun_state(struct vmcb_save_area *to_save, + struct vmcb_save_area *from_save); +void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) -- cgit v1.2.3 From 0a31df6823232516f61f174907e444f710941dfe Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 22 Jul 2021 14:30:18 +0200 Subject: KVM: x86: Check the right feature bit for MSR_KVM_ASYNC_PF_ACK access MSR_KVM_ASYNC_PF_ACK MSR is part of interrupt based asynchronous page fault interface and not the original (deprecated) KVM_FEATURE_ASYNC_PF. This is stated in Documentation/virt/kvm/msr.rst. Fixes: 66570e966dd9 ("kvm: x86: only provide PV features if enabled in guest's CPUID") Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Reviewed-by: Oliver Upton Message-Id: <20210722123018.260035-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a4fd10604f72..4116567f3d44 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3407,7 +3407,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; break; case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; @@ -3746,7 +3746,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; msr_info->data = 0; -- cgit v1.2.3 From 76b4f357d0e7d8f6f0013c733e6cba1773c266d3 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 1 Jul 2021 17:41:00 +0200 Subject: x86/kvm: fix vcpu-id indexed array sizes KVM_MAX_VCPU_ID is the maximum vcpu-id of a guest, and not the number of vcpu-ids. Fix array indexed by vcpu-id to have KVM_MAX_VCPU_ID+1 elements. Note that this is currently no real problem, as KVM_MAX_VCPU_ID is an odd number, resulting in always enough padding being available at the end of those arrays. Nevertheless this should be fixed in order to avoid rare problems in case someone is using an even number for KVM_MAX_VCPU_ID. Signed-off-by: Juergen Gross Message-Id: <20210701154105.23215-2-jgross@suse.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 2 +- arch/x86/kvm/ioapic.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 698969e18fe3..ff005fe738a4 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 660401700075..11e4065e1617 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -43,13 +43,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); + DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID]; + u8 vectors[KVM_MAX_VCPU_ID + 1]; }; -- cgit v1.2.3 From 3fa5e8fd0a0e4ccc03c91df225be2e9b7100800c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 26 Jul 2021 12:39:01 -0400 Subject: KVM: SVM: delay svm_vcpu_init_msrpm after svm->vmcb is initialized Right now, svm_hv_vmcb_dirty_nested_enlightenments has an incorrect dereference of vmcb->control.reserved_sw before the vmcb is checked for being non-NULL. The compiler is usually sinking the dereference after the check; instead of doing this ourselves in the source, ensure that svm_hv_vmcb_dirty_nested_enlightenments is only called with a non-NULL VMCB. Reported-by: Dan Carpenter Cc: Vineeth Pillai Signed-off-by: Paolo Bonzini [Untested for now due to issues with my AMD machine. - Paolo] --- arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/svm/svm_onhyperv.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9a6987549e1b..4bcb95bb8ed7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1406,8 +1406,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) goto error_free_vmsa_page; } - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); @@ -1419,6 +1417,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm_switch_vmcb(svm, &svm->vmcb01); init_vmcb(vcpu); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 9b9a55abc29f..c53b8bf8d013 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -89,7 +89,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( * as we mark it dirty unconditionally towards end of vcpu * init phase. */ - if (vmcb && vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && + if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); } -- cgit v1.2.3 From f1577ab21442476a1015d09e861c08ca76262c06 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 13 Jul 2021 17:20:16 +0300 Subject: KVM: SVM: svm_set_vintr don't warn if AVIC is active but is about to be deactivated It is possible for AVIC inhibit and AVIC active state to be mismatched. Currently we disable AVIC right away on vCPU which started the AVIC inhibit request thus this warning doesn't trigger but at least in theory, if svm_set_vintr is called at the same time on multiple vCPUs, the warning can happen. Signed-off-by: Maxim Levitsky Message-Id: <20210713142023.106183-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4bcb95bb8ed7..e8ccab50ebf6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1568,8 +1568,11 @@ static void svm_set_vintr(struct vcpu_svm *svm) { struct vmcb_control_area *control; - /* The following fields are ignored when AVIC is enabled */ - WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); + /* + * The following fields are ignored when AVIC is enabled + */ + WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + svm_set_intercept(svm, INTERCEPT_VINTR); /* -- cgit v1.2.3 From feea01360cb1925dd31a3d38514eb86f61d69468 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 13 Jul 2021 17:20:17 +0300 Subject: KVM: SVM: tweak warning about enabled AVIC on nested entry It is possible that AVIC was requested to be disabled but not yet disabled, e.g if the nested entry is done right after svm_vcpu_after_set_cpuid. Signed-off-by: Maxim Levitsky Message-Id: <20210713142023.106183-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1c2a0414a88d..61738ff8ef33 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -515,7 +515,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, * avic_physical_id. */ - WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK); + WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); /* Copied from vmcb01. msrpm_base can be overwritten later. */ svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; -- cgit v1.2.3 From 5868b8225ecef4ba3f5b17e65984d60bc5fd6254 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Tue, 13 Jul 2021 17:20:18 +0300 Subject: KVM: SVM: use vmcb01 in svm_refresh_apicv_exec_ctrl Currently when SVM is enabled in guest CPUID, AVIC is inhibited as soon as the guest CPUID is set. AVIC happens to be fully disabled on all vCPUs by the time any guest entry starts (if after migration the entry can be nested). The reason is that currently we disable avic right away on vCPU from which the kvm_request_apicv_update was called and for this case, it happens to be called on all vCPUs (by svm_vcpu_after_set_cpuid). After we stop doing this, AVIC will end up being disabled only when KVM_REQ_APICV_UPDATE is processed which is after we done switching to the nested guest. Fix this by just using vmcb01 in svm_refresh_apicv_exec_ctrl for avic (which is a right thing to do anyway). Signed-off-by: Maxim Levitsky Message-Id: <20210713142023.106183-4-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1d01da64c333..a8ad78a2faa1 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -646,7 +646,7 @@ out: void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = svm->vmcb01.ptr; bool activated = kvm_vcpu_apicv_active(vcpu); if (!enable_apicv) -- cgit v1.2.3 From f5e81d1117501546b7be050c5fbafa6efd2c722c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4 In case of JITs, each of the JIT backends compiles the BPF nospec instruction /either/ to a machine instruction which emits a speculation barrier /or/ to /no/ machine instruction in case the underlying architecture is not affected by Speculative Store Bypass or has different mitigations in place already. This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' instruction for mitigation. In case of arm64, we rely on the firmware mitigation as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled, it works for all of the kernel code with no need to provide any additional instructions here (hence only comment in arm64 JIT). Other archs can follow as needed. The BPF nospec instruction is specifically targeting Spectre v4 since i) we don't use a serialization barrier for the Spectre v1 case, and ii) mitigation instructions for v1 and v4 might be different on some archs. The BPF nospec is required for a future commit, where the BPF verifier does annotate intermediate BPF programs with speculation barriers. Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- arch/arm/net/bpf_jit_32.c | 3 +++ arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++ arch/mips/net/ebpf_jit.c | 3 +++ arch/powerpc/net/bpf_jit_comp32.c | 6 ++++++ arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++ arch/riscv/net/bpf_jit_comp32.c | 4 ++++ arch/riscv/net/bpf_jit_comp64.c | 4 ++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/sparc/net/bpf_jit_comp_64.c | 3 +++ arch/x86/net/bpf_jit_comp.c | 7 +++++++ arch/x86/net/bpf_jit_comp32.c | 6 ++++++ include/linux/filter.h | 15 +++++++++++++++ kernel/bpf/core.c | 19 ++++++++++++++++++- kernel/bpf/disasm.c | 16 +++++++++------- 14 files changed, 102 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 897634d0a67c..a951276f0547 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1602,6 +1602,9 @@ exit: rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index dccf98a37283..41c23f474ea6 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -823,6 +823,19 @@ emit_cond_jmp: return ret; break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + /* + * Nothing required here. + * + * In case of arm64, we rely on the firmware mitigation of + * Speculative Store Bypass as controlled via the ssbd kernel + * parameter. Whenever the mitigation is enabled, it works + * for all of the kernel code with no need to provide any + * additional instructions. + */ + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 939dd06764bc..3a73e9375712 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1355,6 +1355,9 @@ jeq_common: } break; + case BPF_ST | BPF_NOSPEC: /* speculation barrier */ + break; + case BPF_ST | BPF_B | BPF_MEM: case BPF_ST | BPF_H | BPF_MEM: case BPF_ST | BPF_W | BPF_MEM: diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 34bb1583fc0c..beb12cbc8c29 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -737,6 +737,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index de8595880fee..b87a63dba9c8 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -627,6 +627,12 @@ emit_clear: } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 81de865f4c7c..e6497424cbf6 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -1251,6 +1251,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return -1; break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_W: diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 87e3bf5b9086..3af4131c22c7 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -939,6 +939,10 @@ out_be: emit_ld(rd, 0, RV_REG_T1, ctx); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: emit_imm(RV_REG_T1, imm, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 2ae419f5115a..88419263a89a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1153,6 +1153,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, break; } break; + /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; /* * BPF_ST(X) */ diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 4b8d3c65d266..9a2f20cbd48b 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1287,6 +1287,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) return 1; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4b951458c9fc..16d76f814e9b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1219,6 +1219,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; + /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: if (is_ereg(dst_reg)) diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 3da88ded6ee3..3bfda5f502cb 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1886,6 +1886,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, i++; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: diff --git a/include/linux/filter.h b/include/linux/filter.h index 472f97074da0..83b896044e79 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -73,6 +73,11 @@ struct ctl_table_header; /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 +/* unused opcode to mark speculation barrier for mitigating + * Speculative Store Bypass + */ +#define BPF_NOSPEC 0xc0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -390,6 +395,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = 0, \ .imm = 0 }) +/* Speculation barrier */ + +#define BPF_ST_NOSPEC() \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_NOSPEC, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9b1577498373..b1a5fc04492b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -32,6 +32,8 @@ #include #include #include + +#include #include /* Registers */ @@ -1377,6 +1379,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, @@ -1621,7 +1624,21 @@ out: COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP - /* STX and ST and LDX*/ + /* ST, STX and LDX*/ + ST_NOSPEC: + /* Speculation barrier for mitigating Speculative Store Bypass. + * In case of arm64, we rely on the firmware mitigation as + * controlled via the ssbd kernel parameter. Whenever the + * mitigation is enabled, it works for all of the kernel code + * with no need to provide any additional instructions here. + * In case of x86, we use 'lfence' insn for mitigation. We + * reuse preexisting logic from Spectre v1 mitigation that + * happens to produce the required code on x86 for v4 as well. + */ +#ifdef CONFIG_X86 + barrier_nospec(); +#endif + CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bbfc6bb79240..ca3cd9aaa6ce 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_%02x\n", insn->code); } } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { + if (BPF_MODE(insn->code) == BPF_MEM) { + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { + verbose(cbs->private_data, "(%02x) nospec\n", insn->code); + } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); - return; } - verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); -- cgit v1.2.3 From fa7a549d321a4189677b0cea86e58d9db7977f7b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 Jul 2021 17:37:49 -0400 Subject: KVM: x86: accept userspace interrupt only if no event is injected Once an exception has been injected, any side effects related to the exception (such as setting CR2 or DR6) have been taked place. Therefore, once KVM sets the VM-entry interruption information field or the AMD EVENTINJ field, the next VM-entry must deliver that exception. Pending interrupts are processed after injected exceptions, so in theory it would not be a problem to use KVM_INTERRUPT when an injected exception is present. However, DOSEMU is using run->ready_for_interrupt_injection to detect interrupt windows and then using KVM_SET_SREGS/KVM_SET_REGS to inject the interrupt manually. For this to work, the interrupt window must be delayed after the completion of the previous event injection. Cc: stable@vger.kernel.org Reported-by: Stas Sergeev Tested-by: Stas Sergeev Fixes: 71cc849b7093 ("KVM: x86: Fix split-irqchip vs interrupt injection window request") Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4116567f3d44..e5d5c5ed7dd4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4358,8 +4358,17 @@ static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) { - return kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_accept_dm_intr(vcpu); + /* + * Do not cause an interrupt window exit if an exception + * is pending or an event needs reinjection; userspace + * might want to inject the interrupt manually using KVM_SET_REGS + * or KVM_SET_SREGS. For that to work, we must be at an + * instruction boundary and with no events half-injected. + */ + return (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_accept_dm_intr(vcpu) && + !kvm_event_needs_reinjection(vcpu) && + !vcpu->arch.exception.pending); } static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 094121ef815f29d9e6a01fafca365831454ce293 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 28 Jul 2021 20:21:15 +0200 Subject: arch: Kconfig: clean up obsolete use of HAVE_IDE The arch-specific Kconfig files use HAVE_IDE to indicate if IDE is supported. As IDE support and the HAVE_IDE config vanishes with commit b7fb14d3ac63 ("ide: remove the legacy ide driver"), there is no need to mention HAVE_IDE in all those arch-specific Kconfig files. The issue was identified with ./scripts/checkkconfigsymbols.py. Fixes: b7fb14d3ac63 ("ide: remove the legacy ide driver") Suggested-by: Randy Dunlap Signed-off-by: Lukas Bulwahn Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20210728182115.4401-1-lukas.bulwahn@gmail.com Reviewed-by: Christoph Hellwig Acked-by: Geert Uytterhoeven Signed-off-by: Jens Axboe --- arch/alpha/Kconfig | 1 - arch/arm/Kconfig | 6 ------ arch/arm/mach-davinci/Kconfig | 1 - arch/h8300/Kconfig.cpu | 1 - arch/ia64/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - 13 files changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 77d3280dc678..a6d4c2f744e3 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -14,7 +14,6 @@ config ALPHA select PCI_SYSCALL if PCI select HAVE_AOUT select HAVE_ASM_MODVERSIONS - select HAVE_IDE select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select NEED_DMA_MAP_STATE diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 06b6187b67af..f2ce83e643e7 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -95,7 +95,6 @@ config ARM select HAVE_FUNCTION_TRACER if !XIP_KERNEL select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) - select HAVE_IDE if PCI || ISA || PCMCIA select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 @@ -361,7 +360,6 @@ config ARCH_FOOTBRIDGE bool "FootBridge" select CPU_SA110 select FOOTBRIDGE - select HAVE_IDE select NEED_MACH_IO_H if !MMU select NEED_MACH_MEMORY_H help @@ -429,7 +427,6 @@ config ARCH_PXA select GENERIC_IRQ_MULTI_HANDLER select GPIO_PXA select GPIOLIB - select HAVE_IDE select IRQ_DOMAIN select PLAT_PXA select SPARSE_IRQ @@ -445,7 +442,6 @@ config ARCH_RPC select ARM_HAS_SG_CHAIN select CPU_SA110 select FIQ - select HAVE_IDE select HAVE_PATA_PLATFORM select ISA_DMA_API select LEGACY_TIMER_TICK @@ -468,7 +464,6 @@ config ARCH_SA1100 select CPU_SA1100 select GENERIC_IRQ_MULTI_HANDLER select GPIOLIB - select HAVE_IDE select IRQ_DOMAIN select ISA select NEED_MACH_MEMORY_H @@ -504,7 +499,6 @@ config ARCH_OMAP1 select GENERIC_IRQ_CHIP select GENERIC_IRQ_MULTI_HANDLER select GPIOLIB - select HAVE_IDE select HAVE_LEGACY_CLK select IRQ_DOMAIN select NEED_MACH_IO_H if PCCARD diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig index de11030748d0..1d3aef84287d 100644 --- a/arch/arm/mach-davinci/Kconfig +++ b/arch/arm/mach-davinci/Kconfig @@ -9,7 +9,6 @@ menuconfig ARCH_DAVINCI select PM_GENERIC_DOMAINS_OF if PM && OF select REGMAP_MMIO select RESET_CONTROLLER - select HAVE_IDE select PINCTRL_SINGLE if ARCH_DAVINCI diff --git a/arch/h8300/Kconfig.cpu b/arch/h8300/Kconfig.cpu index b5e14d513e62..c30baa0499fc 100644 --- a/arch/h8300/Kconfig.cpu +++ b/arch/h8300/Kconfig.cpu @@ -44,7 +44,6 @@ config H8300_H8MAX bool "H8MAX" select H83069 select RAMKERNEL - select HAVE_IDE help H8MAX Evaluation Board Support More Information. (Japanese Only) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index cf425c2c63af..4993c7ac7ff6 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -25,7 +25,6 @@ config IA64 select HAVE_ASM_MODVERSIONS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_EXIT_THREAD - select HAVE_IDE select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 96989ad46f66..d632a1d576f9 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -23,7 +23,6 @@ config M68K select HAVE_DEBUG_BUGVERBOSE select HAVE_EFFICIENT_UNALIGNED_ACCESS if !CPU_HAS_NO_UNALIGNED select HAVE_FUTEX_CMPXCHG if MMU && FUTEX - select HAVE_IDE select HAVE_MOD_ARCH_SPECIFIC select HAVE_UID16 select MMU_GATHER_NO_RANGE if MMU diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index cee6087cd686..6dfb27d531dd 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -71,7 +71,6 @@ config MIPS select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO - select HAVE_IDE select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_IRQ_TIME_ACCOUNTING diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index bde9907bc5b2..4f8c1fbf8f2f 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -3,7 +3,6 @@ config PARISC def_bool y select ARCH_32BIT_OFF_T if !64BIT select ARCH_MIGHT_HAVE_PC_PARPORT - select HAVE_IDE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d01e3401581d..663766fbf505 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -220,7 +220,6 @@ config PPC select HAVE_HARDLOCKUP_DETECTOR_ARCH if PPC_BOOK3S_64 && SMP select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) - select HAVE_IDE select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_IRQ_TIME_ACCOUNTING diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 45a0549421cd..b683b69a4556 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -39,7 +39,6 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_FTRACE_MCOUNT_RECORD select HAVE_HW_BREAKPOINT - select HAVE_IDE if HAS_IOPORT_MAP select HAVE_IOREMAP_PROT if MMU && !X2TLB select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c5fa7932b550..f0c0f955e169 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -19,7 +19,6 @@ config SPARC select OF select OF_PROMTREE select HAVE_ASM_MODVERSIONS - select HAVE_IDE select HAVE_ARCH_KGDB if !SMP || SPARC64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_SECCOMP if SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 49270655e827..88fb922c23a0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -202,7 +202,6 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT - select HAVE_IDE select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_IRQ_TIME_ACCOUNTING diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 2332b2156993..3878880469d1 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -327,7 +327,6 @@ config XTENSA_PLATFORM_ISS config XTENSA_PLATFORM_XT2000 bool "XT2000" - select HAVE_IDE help XT2000 is the name of Tensilica's feature-rich emulation platform. This hardware is capable of running a full Linux distribution. -- cgit v1.2.3 From 2e2f1e8d0450c561c0c936b4b67e8b5a95975fb7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 30 Jul 2021 14:26:22 +0200 Subject: KVM: x86: hyper-v: Check access to hypercall before reading XMM registers In case guest doesn't have access to the particular hypercall we can avoid reading XMM registers. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Siddharth Chandrasekaran Signed-off-by: Paolo Bonzini Message-Id: <20210730122625.112848-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b07592ca92f0..cb7e045905a5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2173,9 +2173,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); - if (hc.fast && is_xmm_fast_hypercall(&hc)) - kvm_hv_hypercall_read_xmm(&hc); - trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); @@ -2184,6 +2181,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) goto hypercall_complete; } + if (hc.fast && is_xmm_fast_hypercall(&hc)) + kvm_hv_hypercall_read_xmm(&hc); + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep)) { -- cgit v1.2.3 From f5714bbb5b3120b33dfbf3d81ffc0b98ae4cd4c1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 30 Jul 2021 14:26:23 +0200 Subject: KVM: x86: Introduce trace_kvm_hv_hypercall_done() Hypercall failures are unusual with potentially far going consequences so it would be useful to see their results when tracing. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Siddharth Chandrasekaran Signed-off-by: Paolo Bonzini Message-Id: <20210730122625.112848-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 1 + arch/x86/kvm/trace.h | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index cb7e045905a5..2945b93dbadd 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2016,6 +2016,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { + trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b484141ea15b..03ebe368333e 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -92,6 +92,21 @@ TRACE_EVENT(kvm_hv_hypercall, __entry->outgpa) ); +TRACE_EVENT(kvm_hv_hypercall_done, + TP_PROTO(u64 result), + TP_ARGS(result), + + TP_STRUCT__entry( + __field(__u64, result) + ), + + TP_fast_assign( + __entry->result = result; + ), + + TP_printk("result 0x%llx", __entry->result) +); + /* * Tracepoint for Xen hypercall. */ -- cgit v1.2.3 From 4e62aa96d6e55c1b2a4e841f1f8601eae81e81ae Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 30 Jul 2021 14:26:24 +0200 Subject: KVM: x86: hyper-v: Check if guest is allowed to use XMM registers for hypercall input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TLFS states that "Availability of the XMM fast hypercall interface is indicated via the “Hypervisor Feature Identification” CPUID Leaf (0x40000003, see section 2.4.4) ... Any attempt to use this interface when the hypervisor does not indicate availability will result in a #UD fault." Implement the check for 'strict' mode (KVM_CAP_HYPERV_ENFORCE_CPUID). Signed-off-by: Vitaly Kuznetsov Reviewed-by: Siddharth Chandrasekaran Signed-off-by: Paolo Bonzini Message-Id: <20210730122625.112848-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 2945b93dbadd..0b38f944c6b6 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2140,6 +2140,7 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; @@ -2177,13 +2178,21 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); - if (unlikely(!hv_check_hypercall_access(to_hv_vcpu(vcpu), hc.code))) { + if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } - if (hc.fast && is_xmm_fast_hypercall(&hc)) + if (hc.fast && is_xmm_fast_hypercall(&hc)) { + if (unlikely(hv_vcpu->enforce_cpuid && + !(hv_vcpu->cpuid_cache.features_edx & + HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + kvm_hv_hypercall_read_xmm(&hc); + } switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: -- cgit v1.2.3 From 179c6c27bf487273652efc99acd3ba512a23c137 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 3 Aug 2021 09:27:46 -0700 Subject: KVM: SVM: Fix off-by-one indexing when nullifying last used SEV VMCB Use the raw ASID, not ASID-1, when nullifying the last used VMCB when freeing an SEV ASID. The consumer, pre_sev_run(), indexes the array by the raw ASID, thus KVM could get a false negative when checking for a different VMCB if KVM manages to reallocate the same ASID+VMCB combo for a new VM. Note, this cannot cause a functional issue _in the current code_, as pre_sev_run() also checks which pCPU last did VMRUN for the vCPU, and last_vmentry_cpu is initialized to -1 during vCPU creation, i.e. is guaranteed to mismatch on the first VMRUN. However, prior to commit 8a14fe4f0c54 ("kvm: x86: Move last_cpu into kvm_vcpu_arch as last_vmentry_cpu"), SVM tracked pCPU on its own and zero-initialized the last_cpu variable. Thus it's theoretically possible that older versions of KVM could miss a TLB flush if the first VMRUN is on pCPU0 and the ASID and VMCB exactly match those of a prior VM. Fixes: 70cd94e60c73 ("KVM: SVM: VMRUN should use associated ASID when SEV is enabled") Cc: Tom Lendacky Cc: Brijesh Singh Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 6710d9ee2e4b..4d0aba185412 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -189,7 +189,7 @@ static void sev_asid_free(struct kvm_sev_info *sev) for_each_possible_cpu(cpu) { sd = per_cpu(svm_data, cpu); - sd->sev_vmcbs[pos] = NULL; + sd->sev_vmcbs[sev->asid] = NULL; } mutex_unlock(&sev_bitmap_lock); -- cgit v1.2.3 From f4b4b45652578357031fbbef7f7a1b04f6fa2dc3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Jul 2021 11:14:57 +0200 Subject: perf/x86: Fix out of bound MSR access On Wed, Jul 28, 2021 at 12:49:43PM -0400, Vince Weaver wrote: > [32694.087403] unchecked MSR access error: WRMSR to 0x318 (tried to write 0x0000000000000000) at rIP: 0xffffffff8106f854 (native_write_msr+0x4/0x20) > [32694.101374] Call Trace: > [32694.103974] perf_clear_dirty_counters+0x86/0x100 The problem being that it doesn't filter out all fake counters, in specific the above (erroneously) tries to use FIXED_BTS. Limit the fixed counters indexes to the hardware supplied number. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Tested-by: Vince Weaver Tested-by: Like Xu Link: https://lkml.kernel.org/r/YQJxka3dxgdIdebG@hirez.programming.kicks-ass.net --- arch/x86/events/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1eb45139fcc6..3092fbf9dbe4 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2489,13 +2489,15 @@ void perf_clear_dirty_counters(void) return; for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { - /* Metrics and fake events don't have corresponding HW counters. */ - if (is_metric_idx(i) || (i == INTEL_PMC_IDX_FIXED_VLBR)) - continue; - else if (i >= INTEL_PMC_IDX_FIXED) + if (i >= INTEL_PMC_IDX_FIXED) { + /* Metrics and fake events don't have corresponding HW counters. */ + if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed)) + continue; + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); - else + } else { wrmsrl(x86_pmu_event_addr(i), 0); + } } bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); -- cgit v1.2.3 From df51fe7ea1c1c2c3bfdb81279712fdd2e4ea6c27 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 2 Aug 2021 15:08:50 +0800 Subject: perf/x86/amd: Don't touch the AMD64_EVENTSEL_HOSTONLY bit inside the guest If we use "perf record" in an AMD Milan guest, dmesg reports a #GP warning from an unchecked MSR access error on MSR_F15H_PERF_CTLx: [] unchecked MSR access error: WRMSR to 0xc0010200 (tried to write 0x0000020000110076) at rIP: 0xffffffff8106ddb4 (native_write_msr+0x4/0x20) [] Call Trace: [] amd_pmu_disable_event+0x22/0x90 [] x86_pmu_stop+0x4c/0xa0 [] x86_pmu_del+0x3a/0x140 The AMD64_EVENTSEL_HOSTONLY bit is defined and used on the host, while the guest perf driver should avoid such use. Fixes: 1018faa6cf23 ("perf/x86/kvm: Fix Host-Only/Guest-Only counting with SVM disabled") Signed-off-by: Like Xu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Liam Merwick Tested-by: Kim Phillips Tested-by: Liam Merwick Link: https://lkml.kernel.org/r/20210802070850.35295-1-likexu@tencent.com --- arch/x86/events/perf_event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2bf1c7ea2758..2938c902ffbe 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1115,9 +1115,10 @@ void x86_pmu_stop(struct perf_event *event, int flags); static inline void x86_pmu_disable_event(struct perf_event *event) { + u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base, hwc->config); + wrmsrl(hwc->config_base, hwc->config & ~disable_mask); if (is_counter_pair(hwc)) wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0); -- cgit v1.2.3 From bb2baeb214a71cda47d50dce80414016117ddda0 Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Mon, 2 Aug 2021 11:09:03 -0700 Subject: KVM: SVM: improve the code readability for ASID management KVM SEV code uses bitmaps to manage ASID states. ASID 0 was always skipped because it is never used by VM. Thus, in existing code, ASID value and its bitmap postion always has an 'offset-by-1' relationship. Both SEV and SEV-ES shares the ASID space, thus KVM uses a dynamic range [min_asid, max_asid] to handle SEV and SEV-ES ASIDs separately. Existing code mixes the usage of ASID value and its bitmap position by using the same variable called 'min_asid'. Fix the min_asid usage: ensure that its usage is consistent with its name; allocate extra size for ASID 0 to ensure that each ASID has the same value with its bitmap position. Add comments on ASID bitmap allocation to clarify the size change. Signed-off-by: Mingwei Zhang Cc: Tom Lendacky Cc: Marc Orr Cc: David Rientjes Cc: Alper Gun Cc: Dionna Glaze Cc: Sean Christopherson Cc: Vipin Sharma Cc: Peter Gonda Cc: Joerg Roedel Message-Id: <20210802180903.159381-1-mizhang@google.com> [Fix up sev_asid_free to also index by ASID, as suggested by Sean Christopherson, and use nr_asids in sev_cpu_init. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4d0aba185412..7fbce342eec4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -64,6 +64,7 @@ static DEFINE_MUTEX(sev_bitmap_lock); unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long sev_me_mask; +static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; @@ -78,11 +79,11 @@ struct enc_region { /* Called with the sev_bitmap_lock held, or on shutdown */ static int sev_flush_asids(int min_asid, int max_asid) { - int ret, pos, error = 0; + int ret, asid, error = 0; /* Check if there are any ASIDs to reclaim before performing a flush */ - pos = find_next_bit(sev_reclaim_asid_bitmap, max_asid, min_asid); - if (pos >= max_asid) + asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); + if (asid > max_asid) return -EBUSY; /* @@ -115,15 +116,15 @@ static bool __sev_recycle_asids(int min_asid, int max_asid) /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, - max_sev_asid); - bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); + nr_asids); + bitmap_zero(sev_reclaim_asid_bitmap, nr_asids); return true; } static int sev_asid_new(struct kvm_sev_info *sev) { - int pos, min_asid, max_asid, ret; + int asid, min_asid, max_asid, ret; bool retry = true; enum misc_res_type type; @@ -143,11 +144,11 @@ static int sev_asid_new(struct kvm_sev_info *sev) * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. */ - min_asid = sev->es_active ? 0 : min_sev_asid - 1; + min_asid = sev->es_active ? 1 : min_sev_asid; max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: - pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid); - if (pos >= max_asid) { + asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); + if (asid > max_asid) { if (retry && __sev_recycle_asids(min_asid, max_asid)) { retry = false; goto again; @@ -157,11 +158,11 @@ again: goto e_uncharge; } - __set_bit(pos, sev_asid_bitmap); + __set_bit(asid, sev_asid_bitmap); mutex_unlock(&sev_bitmap_lock); - return pos + 1; + return asid; e_uncharge: misc_cg_uncharge(type, sev->misc_cg, 1); put_misc_cg(sev->misc_cg); @@ -179,13 +180,12 @@ static int sev_get_asid(struct kvm *kvm) static void sev_asid_free(struct kvm_sev_info *sev) { struct svm_cpu_data *sd; - int cpu, pos; + int cpu; enum misc_res_type type; mutex_lock(&sev_bitmap_lock); - pos = sev->asid - 1; - __set_bit(pos, sev_reclaim_asid_bitmap); + __set_bit(sev->asid, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { sd = per_cpu(svm_data, cpu); @@ -1857,12 +1857,17 @@ void __init sev_hardware_setup(void) min_sev_asid = edx; sev_me_mask = 1UL << (ebx & 0x3f); - /* Initialize SEV ASID bitmaps */ - sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + /* + * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap, + * even though it's never used, so that the bitmap is indexed by the + * actual ASID. + */ + nr_asids = max_sev_asid + 1; + sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); if (!sev_asid_bitmap) goto out; - sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); if (!sev_reclaim_asid_bitmap) { bitmap_free(sev_asid_bitmap); sev_asid_bitmap = NULL; @@ -1907,7 +1912,7 @@ void sev_hardware_teardown(void) return; /* No need to take sev_bitmap_lock, all VMs have been destroyed. */ - sev_flush_asids(0, max_sev_asid); + sev_flush_asids(1, max_sev_asid); bitmap_free(sev_asid_bitmap); bitmap_free(sev_reclaim_asid_bitmap); @@ -1921,7 +1926,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) if (!sev_enabled) return 0; - sd->sev_vmcbs = kcalloc(max_sev_asid + 1, sizeof(void *), GFP_KERNEL); + sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL); if (!sd->sev_vmcbs) return -ENOMEM; -- cgit v1.2.3 From d5aaad6f83420efb8357ac8e11c868708b22d0a9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 4 Aug 2021 14:46:09 -0700 Subject: KVM: x86/mmu: Fix per-cpu counter corruption on 32-bit builds Take a signed 'long' instead of an 'unsigned long' for the number of pages to add/subtract to the total number of pages used by the MMU. This fixes a zero-extension bug on 32-bit kernels that effectively corrupts the per-cpu counter used by the shrinker. Per-cpu counters take a signed 64-bit value on both 32-bit and 64-bit kernels, whereas kvm_mod_used_mmu_pages() takes an unsigned long and thus an unsigned 32-bit value on 32-bit kernels. As a result, the value used to adjust the per-cpu counter is zero-extended (unsigned -> signed), not sign-extended (signed -> signed), and so KVM's intended -1 gets morphed to 4294967295 and effectively corrupts the counter. This was found by a staggering amount of sheer dumb luck when running kvm-unit-tests on a 32-bit KVM build. The shrinker just happened to kick in while running tests and do_shrink_slab() logged an error about trying to free a negative number of objects. The truly lucky part is that the kernel just happened to be a slightly stale build, as the shrinker no longer yells about negative objects as of commit 18bb473e5031 ("mm: vmscan: shrink deferred objects proportional to priority"). vmscan: shrink_slab: mmu_shrink_scan+0x0/0x210 [kvm] negative objects to delete nr=-858993460 Fixes: bc8a3d8925a8 ("kvm: mmu: Fix overflow on kvm mmu page limit calculation") Cc: stable@vger.kernel.org Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210804214609.1096003-1-seanjc@google.com> Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 66f7f5bc3482..c4f4fa23320e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1644,7 +1644,7 @@ static int is_empty_shadow_page(u64 *spt) * aggregate version in order to make the slab shrinker * faster */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr) +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) { kvm->arch.n_used_mmu_pages += nr; percpu_counter_add(&kvm_total_used_mmu_pages, nr); -- cgit v1.2.3 From fa953adfad7cf9c7e30d9ea0e4ccfd38cfb5495d Mon Sep 17 00:00:00 2001 From: "H. Nikolaus Schaller" Date: Thu, 8 Jul 2021 10:57:09 +0200 Subject: x86/tools/relocs: Fix non-POSIX regexp Trying to run a cross-compiled x86 relocs tool on a BSD based HOSTCC leads to errors like VOFFSET arch/x86/boot/compressed/../voffset.h - due to: vmlinux CC arch/x86/boot/compressed/misc.o - due to: arch/x86/boot/compressed/../voffset.h OBJCOPY arch/x86/boot/compressed/vmlinux.bin - due to: vmlinux RELOCS arch/x86/boot/compressed/vmlinux.relocs - due to: vmlinux empty (sub)expressionarch/x86/boot/compressed/Makefile:118: recipe for target 'arch/x86/boot/compressed/vmlinux.relocs' failed make[3]: *** [arch/x86/boot/compressed/vmlinux.relocs] Error 1 It turns out that relocs.c uses patterns like "something(|_end)" This is not valid syntax or gives undefined results according to POSIX 9.5.3 ERE Grammar https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html It seems to be silently accepted by the Linux regexp() implementation while a BSD host complains. Such patterns can be replaced by a transformation like "(|p1|p2)" -> "(p1|p2)?" Fixes: fd952815307f ("x86-32, relocs: Whitelist more symbols for ld bug workaround") Signed-off-by: H. Nikolaus Schaller Signed-off-by: Masahiro Yamada --- arch/x86/tools/relocs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 04c5a44b9682..9ba700dc47de 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -57,12 +57,12 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { [S_REL] = "^(__init_(begin|end)|" "__x86_cpu_dev_(start|end)|" - "(__parainstructions|__alt_instructions)(|_end)|" - "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|" + "(__parainstructions|__alt_instructions)(_end)?|" + "(__iommu_table|__apicdrivers|__smp_locks)(_end)?|" "__(start|end)_pci_.*|" "__(start|end)_builtin_fw|" - "__(start|stop)___ksymtab(|_gpl)|" - "__(start|stop)___kcrctab(|_gpl)|" + "__(start|stop)___ksymtab(_gpl)?|" + "__(start|stop)___kcrctab(_gpl)?|" "__(start|stop)___param|" "__(start|stop)___modver|" "__(start|stop)___bug_table|" -- cgit v1.2.3 From acade6379930dfa7987f4bd9b26d1a701cc1b542 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 3 Aug 2021 06:25:28 -0700 Subject: perf/x86/intel: Apply mid ACK for small core A warning as below may be occasionally triggered in an ADL machine when these conditions occur: - Two perf record commands run one by one. Both record a PEBS event. - Both runs on small cores. - They have different adaptive PEBS configuration (PEBS_DATA_CFG). [ ] WARNING: CPU: 4 PID: 9874 at arch/x86/events/intel/ds.c:1743 setup_pebs_adaptive_sample_data+0x55e/0x5b0 [ ] RIP: 0010:setup_pebs_adaptive_sample_data+0x55e/0x5b0 [ ] Call Trace: [ ] [ ] intel_pmu_drain_pebs_icl+0x48b/0x810 [ ] perf_event_nmi_handler+0x41/0x80 [ ] [ ] __perf_event_task_sched_in+0x2c2/0x3a0 Different from the big core, the small core requires the ACK right before re-enabling counters in the NMI handler, otherwise a stale PEBS record may be dumped into the later NMI handler, which trigger the warning. Add a new mid_ack flag to track the case. Add all PMI handler bits in the struct x86_hybrid_pmu to track the bits for different types of PMUs. Apply mid ACK for the small cores on an Alder Lake machine. The existing hybrid() macro has a compile error when taking address of a bit-field variable. Add a new macro hybrid_bit() to get the bit-field value of a given PMU. Fixes: f83d2f91d259 ("perf/x86/intel: Add Alder Lake Hybrid support") Reported-by: Ammy Yi Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Tested-by: Ammy Yi Link: https://lkml.kernel.org/r/1627997128-57891-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 23 +++++++++++++++-------- arch/x86/events/perf_event.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fca7a6e2242f..ac6fd2dabf6a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2904,24 +2904,28 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ static int intel_pmu_handle_irq(struct pt_regs *regs) { - struct cpu_hw_events *cpuc; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + bool late_ack = hybrid_bit(cpuc->pmu, late_ack); + bool mid_ack = hybrid_bit(cpuc->pmu, mid_ack); int loops; u64 status; int handled; int pmu_enabled; - cpuc = this_cpu_ptr(&cpu_hw_events); - /* * Save the PMU state. * It needs to be restored when leaving the handler. */ pmu_enabled = cpuc->enabled; /* - * No known reason to not always do late ACK, - * but just in case do it opt-in. + * In general, the early ACK is only applied for old platforms. + * For the big core starts from Haswell, the late ACK should be + * applied. + * For the small core after Tremont, we have to do the ACK right + * before re-enabling counters, which is in the middle of the + * NMI handler. */ - if (!x86_pmu.late_ack) + if (!late_ack && !mid_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); intel_bts_disable_local(); cpuc->enabled = 0; @@ -2958,6 +2962,8 @@ again: goto again; done: + if (mid_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); /* Only restore PMU state when it's active. See x86_pmu_disable(). */ cpuc->enabled = pmu_enabled; if (pmu_enabled) @@ -2969,7 +2975,7 @@ done: * have been reset. This avoids spurious NMIs on * Haswell CPUs. */ - if (x86_pmu.late_ack) + if (late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } @@ -6129,7 +6135,6 @@ __init int intel_pmu_init(void) static_branch_enable(&perf_is_hybrid); x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS; - x86_pmu.late_ack = true; x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.pebs_block = true; @@ -6167,6 +6172,7 @@ __init int intel_pmu_init(void) pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; pmu->name = "cpu_core"; pmu->cpu_type = hybrid_big; + pmu->late_ack = true; if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { pmu->num_counters = x86_pmu.num_counters + 2; pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; @@ -6192,6 +6198,7 @@ __init int intel_pmu_init(void) pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; pmu->name = "cpu_atom"; pmu->cpu_type = hybrid_small; + pmu->mid_ack = true; pmu->num_counters = x86_pmu.num_counters; pmu->num_counters_fixed = x86_pmu.num_counters_fixed; pmu->max_pebs_events = x86_pmu.max_pebs_events; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2938c902ffbe..e3ac05c97b5e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -656,6 +656,10 @@ struct x86_hybrid_pmu { struct event_constraint *event_constraints; struct event_constraint *pebs_constraints; struct extra_reg *extra_regs; + + unsigned int late_ack :1, + mid_ack :1, + enabled_ack :1; }; static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu) @@ -686,6 +690,16 @@ extern struct static_key_false perf_is_hybrid; __Fp; \ })) +#define hybrid_bit(_pmu, _field) \ +({ \ + bool __Fp = x86_pmu._field; \ + \ + if (is_hybrid() && (_pmu)) \ + __Fp = hybrid_pmu(_pmu)->_field; \ + \ + __Fp; \ +}) + enum hybrid_pmu_type { hybrid_big = 0x40, hybrid_small = 0x20, @@ -755,6 +769,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, + mid_ack :1, enabled_ack :1; /* * sysfs attrs -- cgit v1.2.3 From 0c0e37dc11671384e53ba6ede53a4d91162a2cc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:49 +0200 Subject: x86/ioapic: Force affinity setup before startup The IO/APIC cannot handle interrupt affinity changes safely after startup other than from an interrupt handler. The startup sequence in the generic interrupt code violates that assumption. Mark the irq chip with the new IRQCHIP_AFFINITY_PRE_STARTUP flag so that the default interrupt setting happens before the interrupt is started up for the first time. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.832143400@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d5c691a3208b..39224e035e47 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1986,7 +1986,8 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static struct irq_chip ioapic_ir_chip __read_mostly = { @@ -1999,7 +2000,8 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static inline void init_IO_APIC_traps(void) -- cgit v1.2.3 From ff363f480e5997051dd1de949121ffda3b753741 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:50 +0200 Subject: x86/msi: Force affinity setup before startup The X86 MSI mechanism cannot handle interrupt affinity changes safely after startup other than from an interrupt handler, unless interrupt remapping is enabled. The startup sequence in the generic interrupt code violates that assumption. Mark the irq chips with the new IRQCHIP_AFFINITY_PRE_STARTUP flag so that the default interrupt setting happens before the interrupt is started up for the first time. While the interrupt remapping MSI chip does not require this, there is no point in treating it differently as this might spare an interrupt to a CPU which is not in the default affinity mask. For the non-remapping case go to the direct write path when the interrupt is not yet started similar to the not yet activated case. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.886722080@linutronix.de --- arch/x86/kernel/apic/msi.c | 11 ++++++++--- arch/x86/kernel/hpet.c | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 44ebe25e7703..dbacb9ec8843 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -58,11 +58,13 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * The quirk bit is not set in this case. * - The new vector is the same as the old vector * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The interrupt is not yet started up * - The new destination CPU is the same as the old destination CPU */ if (!irqd_msi_nomask_quirk(irqd) || cfg->vector == old_cfg.vector || old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + !irqd_is_started(irqd) || cfg->dest_apicid == old_cfg.dest_apicid) { irq_msi_update_msg(irqd, cfg); return ret; @@ -150,7 +152,8 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, @@ -219,7 +222,8 @@ static struct irq_chip pci_msi_ir_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static struct msi_domain_info pci_msi_ir_domain_info = { @@ -273,7 +277,8 @@ static struct irq_chip dmar_msi_controller = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static int dmar_msi_init(struct irq_domain *domain, diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 08651a4e6aa0..42fc41dd0e1f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -508,7 +508,7 @@ static struct irq_chip hpet_msi_controller __ro_after_init = { .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, }; static int hpet_msi_init(struct irq_domain *domain, -- cgit v1.2.3 From 7b9cae027ba3aaac295ae23a62f47876ed97da73 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 10:19:49 -0700 Subject: KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation Use the secondary_exec_controls_get() accessor in vmx_has_waitpkg() to effectively get the controls for the current VMCS, as opposed to using vmx->secondary_exec_controls, which is the cached value of KVM's desired controls for vmcs01 and truly not reflective of any particular VMCS. While the waitpkg control is not dynamic, i.e. vmcs01 will always hold the same waitpkg configuration as vmx->secondary_exec_controls, the same does not hold true for vmcs02 if the L1 VMM hides the feature from L2. If L1 hides the feature _and_ does not intercept MSR_IA32_UMWAIT_CONTROL, L2 could incorrectly read/write L1's virtual MSR instead of taking a #GP. Fixes: 6e3ba4abcea5 ("KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210810171952.2758100-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index db88ed4f2121..17a1cb4b059d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -522,7 +522,7 @@ static inline struct vmcs *alloc_vmcs(bool shadow) static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { - return vmx->secondary_exec_control & + return secondary_exec_controls_get(vmx) & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } -- cgit v1.2.3 From 839ad22f755132838f406751439363c07272ad87 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 30 Jul 2021 17:01:46 -0700 Subject: x86/tools: Fix objdump version check again Skip (omit) any version string info that is parenthesized. Warning: objdump version 15) is older than 2.19 Warning: Skipping posttest. where 'objdump -v' says: GNU objdump (GNU Binutils; SUSE Linux Enterprise 15) 2.35.1.20201123-7.18 Fixes: 8bee738bb1979 ("x86: Fix objdump version check in chkobjdump.awk for different formats.") Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20210731000146.2720-1-rdunlap@infradead.org --- arch/x86/tools/chkobjdump.awk | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk index fd1ab80be0de..a4cf678cf5c8 100644 --- a/arch/x86/tools/chkobjdump.awk +++ b/arch/x86/tools/chkobjdump.awk @@ -10,6 +10,7 @@ BEGIN { /^GNU objdump/ { verstr = "" + gsub(/\(.*\)/, ""); for (i = 3; i <= NF; i++) if (match($(i), "^[0-9]")) { verstr = $(i); -- cgit v1.2.3 From 064855a69003c24bd6b473b367d364e418c57625 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 2 Aug 2021 14:38:58 -0500 Subject: x86/resctrl: Fix default monitoring groups reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Creating a new sub monitoring group in the root /sys/fs/resctrl leads to getting the "Unavailable" value for mbm_total_bytes and mbm_local_bytes on the entire filesystem. Steps to reproduce: 1. mount -t resctrl resctrl /sys/fs/resctrl/ 2. cd /sys/fs/resctrl/ 3. cat mon_data/mon_L3_00/mbm_total_bytes 23189832 4. Create sub monitor group: mkdir mon_groups/test1 5. cat mon_data/mon_L3_00/mbm_total_bytes Unavailable When a new monitoring group is created, a new RMID is assigned to the new group. But the RMID is not active yet. When the events are read on the new RMID, it is expected to report the status as "Unavailable". When the user reads the events on the default monitoring group with multiple subgroups, the events on all subgroups are consolidated together. Currently, if any of the RMID reads report as "Unavailable", then everything will be reported as "Unavailable". Fix the issue by discarding the "Unavailable" reads and reporting all the successful RMID reads. This is not a problem on Intel systems as Intel reports 0 on Inactive RMIDs. Fixes: d89b7379015f ("x86/intel_rdt/cqm: Add mon_data") Reported-by: Paweł Szulik Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov Acked-by: Reinette Chatre Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=213311 Link: https://lkml.kernel.org/r/162793309296.9224.15871659871696482080.stgit@bmoger-ubuntu --- arch/x86/kernel/cpu/resctrl/monitor.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f07c10b87a87..57e4bb695ff9 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -285,15 +285,14 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >>= shift; } -static int __mon_event_count(u32 rmid, struct rmid_read *rr) +static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) { struct mbm_state *m; u64 chunks, tval; tval = __rmid_read(rmid, rr->evtid); if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { - rr->val = tval; - return -EINVAL; + return tval; } switch (rr->evtid) { case QOS_L3_OCCUP_EVENT_ID: @@ -305,12 +304,6 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) case QOS_L3_MBM_LOCAL_EVENT_ID: m = &rr->d->mbm_local[rmid]; break; - default: - /* - * Code would never reach here because - * an invalid event id would fail the __rmid_read. - */ - return -EINVAL; } if (rr->first) { @@ -361,23 +354,29 @@ void mon_event_count(void *info) struct rdtgroup *rdtgrp, *entry; struct rmid_read *rr = info; struct list_head *head; + u64 ret_val; rdtgrp = rr->rgrp; - if (__mon_event_count(rdtgrp->mon.rmid, rr)) - return; + ret_val = __mon_event_count(rdtgrp->mon.rmid, rr); /* - * For Ctrl groups read data from child monitor groups. + * For Ctrl groups read data from child monitor groups and + * add them together. Count events which are read successfully. + * Discard the rmid_read's reporting errors. */ head = &rdtgrp->mon.crdtgrp_list; if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr)) - return; + if (__mon_event_count(entry->mon.rmid, rr) == 0) + ret_val = 0; } } + + /* Report error if none of rmid_reads are successful */ + if (ret_val) + rr->val = ret_val; } /* -- cgit v1.2.3 From 1383279c6494c6b62d1d6939f34906a4d2ef721c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Aug 2021 11:38:04 -0700 Subject: KVM: x86: Allow guest to set EFER.NX=1 on non-PAE 32-bit kernels Remove an ancient restriction that disallowed exposing EFER.NX to the guest if EFER.NX=0 on the host, even if NX is fully supported by the CPU. The motivation of the check, added by commit 2cc51560aed0 ("KVM: VMX: Avoid saving and restoring msr_efer on lightweight vmexit"), was to rule out the case of host.EFER.NX=0 and guest.EFER.NX=1 so that KVM could run the guest with the host's EFER.NX and thus avoid context switching EFER if the only divergence was the NX bit. Fast forward to today, and KVM has long since stopped running the guest with the host's EFER.NX. Not only does KVM context switch EFER if host.EFER.NX=1 && guest.EFER.NX=0, KVM also forces host.EFER.NX=0 && guest.EFER.NX=1 when using shadow paging (to emulate SMEP). Furthermore, the entire motivation for the restriction was made obsolete over a decade ago when Intel added dedicated host and guest EFER fields in the VMCS (Nehalem timeframe), which reduced the overhead of context switching EFER from 400+ cycles (2 * WRMSR + 1 * RDMSR) to a mere ~2 cycles. In practice, the removed restriction only affects non-PAE 32-bit kernels, as EFER.NX is set during boot if NX is supported and the kernel will use PAE paging (32-bit or 64-bit), regardless of whether or not the kernel will actually use NX itself (mark PTEs non-executable). Alternatively and/or complementarily, startup_32_smp() in head_32.S could be modified to set EFER.NX=1 regardless of paging mode, thus eliminating the scenario where NX is supported but not enabled. However, that runs the risk of breaking non-KVM non-PAE kernels (though the risk is very, very low as there are no known EFER.NX errata), and also eliminates an easy-to-use mechanism for stressing KVM's handling of guest vs. host EFER across nested virtualization transitions. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20210805183804.1221554-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 739be5da3bca..fe03bd978761 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -208,30 +208,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_after_set_cpuid(vcpu); } -static int is_efer_nx(void) -{ - return host_efer & EFER_NX; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_cpuid_entry2 *e, *entry; - - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { - cpuid_entry_clear(entry, X86_FEATURE_NX); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -302,7 +278,6 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); @@ -401,7 +376,6 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) void kvm_set_cpu_caps(void) { - unsigned int f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 unsigned int f_gbpages = F(GBPAGES); unsigned int f_lm = F(LM); @@ -515,7 +489,7 @@ void kvm_set_cpu_caps(void) F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) ); -- cgit v1.2.3 From ffbe17cadaf564b5da0e4eabdcff1b719e184a76 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 9 Aug 2021 07:00:58 -0400 Subject: KVM: x86: remove dead initialization hv_vcpu is initialized again a dozen lines below, and at this point vcpu->arch.hyperv is not valid. Remove the initializer. Reported-by: kernel test robot Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0b38f944c6b6..41d2a53c5dea 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1933,7 +1933,7 @@ ret_success: void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { -- cgit v1.2.3 From 85aa8889b82e0eec680a21ea28dbf57c6acfe182 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Fri, 6 Aug 2021 15:22:29 -0700 Subject: kvm: vmx: Sync all matching EPTPs when injecting nested EPT fault When a nested EPT violation/misconfig is injected into the guest, the shadow EPT PTEs associated with that address need to be synced. This is done by kvm_inject_emulated_page_fault() before it calls nested_ept_inject_page_fault(). However, that will only sync the shadow EPT PTE associated with the current L1 EPTP. Since the ASID is based on EP4TA rather than the full EPTP, so syncing the current EPTP is not enough. The SPTEs associated with any other L1 EPTPs in the prev_roots cache with the same EP4TA also need to be synced. Signed-off-by: Junaid Shahid Message-Id: <20210806222229.1645356-1-junaids@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 53 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a52134b0c42..81160ebe6acb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -330,6 +330,31 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) vcpu_put(vcpu); } +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + +static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, + gpa_t addr) +{ + uint i; + struct kvm_mmu_root_info *cached_root; + + WARN_ON_ONCE(!mmu_is_nested(vcpu)); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + cached_root = &vcpu->arch.mmu->prev_roots[i]; + + if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, + eptp)) + vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + } +} + static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -342,10 +367,22 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; exit_qualification &= INTR_INFO_UNBLOCK_NMI; - } else if (fault->error_code & PFERR_RSVD_MASK) - vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else - vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } else { + if (fault->error_code & PFERR_RSVD_MASK) + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + + /* + * Although the caller (kvm_inject_emulated_page_fault) would + * have already synced the faulting address in the shadow EPT + * tables for the current EPTP12, we also need to sync it for + * any other cached EPTP02s based on the same EP4TA, since the + * TLB associates mappings to the EP4TA rather than the full EPTP. + */ + nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, + fault->address); + } nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; @@ -5325,14 +5362,6 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return nested_vmx_succeed(vcpu); } -#define EPTP_PA_MASK GENMASK_ULL(51, 12) - -static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) -{ - return VALID_PAGE(root_hpa) && - ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); -} - /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { -- cgit v1.2.3 From 18712c13709d2de9516c5d3414f707c4f0a9c190 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Aug 2021 21:56:15 -0700 Subject: KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF Use vmx_need_pf_intercept() when determining if L0 wants to handle a #PF in L2 or if the VM-Exit should be forwarded to L1. The current logic fails to account for the case where #PF is intercepted to handle guest.MAXPHYADDR < host.MAXPHYADDR and ends up reflecting all #PFs into L1. At best, L1 will complain and inject the #PF back into L2. At worst, L1 will eat the unexpected fault and cause L2 to hang on infinite page faults. Note, while the bug was technically introduced by the commit that added support for the MAXPHYADDR madness, the shame is all on commit a0c134347baf ("KVM: VMX: introduce vmx_need_pf_intercept"). Fixes: 1dbf5d68af6f ("KVM: VMX: Add guest physical address check in EPT violation and misconfig") Cc: stable@vger.kernel.org Cc: Peter Shier Cc: Oliver Upton Cc: Jim Mattson Signed-off-by: Sean Christopherson Message-Id: <20210812045615.3167686-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 81160ebe6acb..b3f77d18eb5a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5855,7 +5855,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) - return vcpu->arch.apf.host_apf_flags || !enable_ept; + return vcpu->arch.apf.host_apf_flags || + vmx_need_pf_intercept(vcpu); else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) -- cgit v1.2.3 From 524a1e4e381fc5e7781008d5bd420fd1357c0113 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Aug 2021 11:14:13 -0700 Subject: KVM: x86/mmu: Don't leak non-leaf SPTEs when zapping all SPTEs Pass "all ones" as the end GFN to signal "zap all" for the TDP MMU and really zap all SPTEs in this case. As is, zap_gfn_range() skips non-leaf SPTEs whose range exceeds the range to be zapped. If shadow_phys_bits is not aligned to the range size of top-level SPTEs, e.g. 512gb with 4-level paging, the "zap all" flows will skip top-level SPTEs whose range extends beyond shadow_phys_bits and leak their SPs when the VM is destroyed. Use the current upper bound (based on host.MAXPHYADDR) to detect that the caller wants to zap all SPTEs, e.g. instead of using the max theoretical gfn, 1 << (52 - 12). The more precise upper bound allows the TDP iterator to terminate its walk earlier when running on hosts with MAXPHYADDR < 52. Add a WARN on kmv->arch.tdp_mmu_pages when the TDP MMU is destroyed to help future debuggers should KVM decide to leak SPTEs again. The bug is most easily reproduced by running (and unloading!) KVM in a VM whose host.MAXPHYADDR < 39, as the SPTE for gfn=0 will be skipped. ============================================================================= BUG kvm_mmu_page_header (Not tainted): Objects remaining in kvm_mmu_page_header on __kmem_cache_shutdown() ----------------------------------------------------------------------------- Slab 0x000000004d8f7af1 objects=22 used=2 fp=0x00000000624d29ac flags=0x4000000000000200(slab|zone=1) CPU: 0 PID: 1582 Comm: rmmod Not tainted 5.14.0-rc2+ #420 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x45/0x59 slab_err+0x95/0xc9 __kmem_cache_shutdown.cold+0x3c/0x158 kmem_cache_destroy+0x3d/0xf0 kvm_mmu_module_exit+0xa/0x30 [kvm] kvm_arch_exit+0x5d/0x90 [kvm] kvm_exit+0x78/0x90 [kvm] vmx_exit+0x1a/0x50 [kvm_intel] __x64_sys_delete_module+0x13f/0x220 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU") Cc: stable@vger.kernel.org Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210812181414.3376143-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 0853370bd811..8783b9eb2b33 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) if (!kvm->arch.tdp_mmu_enabled) return; + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - kvm_lockdep_assert_mmu_lock_held(kvm, shared); if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) @@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, list_del_rcu(&root->link); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); - zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); + zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -724,8 +723,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield, bool flush, bool shared) { + gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + bool zap_all = (start == 0 && end >= max_gfn_host); struct tdp_iter iter; + /* + * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will + * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, + * and so KVM will never install a SPTE for such addresses. + */ + end = min(end, max_gfn_host); + kvm_lockdep_assert_mmu_lock_held(kvm, shared); rcu_read_lock(); @@ -744,9 +752,10 @@ retry: /* * If this is a non-last-level SPTE that covers a larger range * than should be zapped, continue, and zap the mappings at a - * lower level. + * lower level, except when zapping all SPTEs. */ - if ((iter.gfn < start || + if (!zap_all && + (iter.gfn < start || iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && !is_last_spte(iter.old_spte, iter.level)) continue; @@ -794,12 +803,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); bool flush = false; int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush, false); if (flush) @@ -838,7 +846,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); struct kvm_mmu_page *next_root; struct kvm_mmu_page *root; bool flush = false; @@ -854,8 +861,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) rcu_read_unlock(); - flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, - true); + flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); /* * Put the reference acquired in -- cgit v1.2.3 From 0103098fb4f13b447b26ed514bcd3140f6791047 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Aug 2021 11:14:14 -0700 Subject: KVM: x86/mmu: Don't step down in the TDP iterator when zapping all SPTEs Set the min_level for the TDP iterator at the root level when zapping all SPTEs to optimize the iterator's try_step_down(). Zapping a non-leaf SPTE will recursively zap all its children, thus there is no need for the iterator to attempt to step down. This avoids rereading the top-level SPTEs after they are zapped by causing try_step_down() to short-circuit. In most cases, optimizing try_step_down() will be in the noise as the cost of zapping SPTEs completely dominates the overall time. The optimization is however helpful if the zap occurs with relatively few SPTEs, e.g. if KVM is zapping in response to multiple memslot updates when userspace is adding and removing read-only memslots for option ROMs. In that case, the task doing the zapping likely isn't a vCPU thread, but it still holds mmu_lock for read and thus can be a noisy neighbor of sorts. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210812181414.3376143-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 8783b9eb2b33..d80cb122b5f3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -727,6 +727,12 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, bool zap_all = (start == 0 && end >= max_gfn_host); struct tdp_iter iter; + /* + * No need to try to step down in the iterator when zapping all SPTEs, + * zapping the top-level non-leaf SPTEs will recurse on their children. + */ + int min_level = zap_all ? root->role.level : PG_LEVEL_4K; + /* * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, @@ -738,7 +744,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { retry: if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { -- cgit v1.2.3 From ce25681d59ffc4303321e555a2d71b1946af07da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Aug 2021 11:18:15 -0700 Subject: KVM: x86/mmu: Protect marking SPs unsync when using TDP MMU with spinlock Add yet another spinlock for the TDP MMU and take it when marking indirect shadow pages unsync. When using the TDP MMU and L1 is running L2(s) with nested TDP, KVM may encounter shadow pages for the TDP entries managed by L1 (controlling L2) when handling a TDP MMU page fault. The unsync logic is not thread safe, e.g. the kvm_mmu_page fields are not atomic, and misbehaves when a shadow page is marked unsync via a TDP MMU page fault, which runs with mmu_lock held for read, not write. Lack of a critical section manifests most visibly as an underflow of unsync_children in clear_unsync_child_bit() due to unsync_children being corrupted when multiple CPUs write it without a critical section and without atomic operations. But underflow is the best case scenario. The worst case scenario is that unsync_children prematurely hits '0' and leads to guest memory corruption due to KVM neglecting to properly sync shadow pages. Use an entirely new spinlock even though piggybacking tdp_mmu_pages_lock would functionally be ok. Usurping the lock could degrade performance when building upper level page tables on different vCPUs, especially since the unsync flow could hold the lock for a comparatively long time depending on the number of indirect shadow pages and the depth of the paging tree. For simplicity, take the lock for all MMUs, even though KVM could fairly easily know that mmu_lock is held for write. If mmu_lock is held for write, there cannot be contention for the inner spinlock, and marking shadow pages unsync across multiple vCPUs will be slow enough that bouncing the kvm_arch cacheline should be in the noise. Note, even though L2 could theoretically be given access to its own EPT entries, a nested MMU must hold mmu_lock for write and thus cannot race against a TDP MMU page fault. I.e. the additional spinlock only _needs_ to be taken by the TDP MMU, as opposed to being taken by any MMU for a VM that is running with the TDP MMU enabled. Holding mmu_lock for read also prevents the indirect shadow page from being freed. But as above, keep it simple and always take the lock. Alternative #1, the TDP MMU could simply pass "false" for can_unsync and effectively disable unsync behavior for nested TDP. Write protecting leaf shadow pages is unlikely to noticeably impact traditional L1 VMMs, as such VMMs typically don't modify TDP entries, but the same may not hold true for non-standard use cases and/or VMMs that are migrating physical pages (from L1's perspective). Alternative #2, the unsync logic could be made thread safe. In theory, simply converting all relevant kvm_mmu_page fields to atomics and using atomic bitops for the bitmap would suffice. However, (a) an in-depth audit would be required, (b) the code churn would be substantial, and (c) legacy shadow paging would incur additional atomic operations in performance sensitive paths for no benefit (to legacy shadow paging). Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU") Cc: stable@vger.kernel.org Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210812181815.3378104-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 8 ++++---- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 35eca377543d..88fa495abbac 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -25,10 +25,10 @@ On x86: - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock -- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is - taken inside kvm->arch.mmu_lock, and cannot be taken without already - holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise - there's no need to take kvm->arch.tdp_mmu_pages_lock at all). +- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and + kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and + cannot be taken without already holding kvm->arch.mmu_lock (typically with + ``read_lock`` for the TDP MMU, thus the need for additional spinlocks). Everything else is a leaf: no other lock is taken inside the critical sections. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 974cbfb1eefe..af6ce8d4c86a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1038,6 +1038,13 @@ struct kvm_arch { struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; + /* + * Protects marking pages unsync during page faults, as TDP MMU page + * faults only take mmu_lock for read. For simplicity, the unsync + * pages lock is always taken when marking pages unsync regardless of + * whether mmu_lock is held for read or write. + */ + spinlock_t mmu_unsync_pages_lock; struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c4f4fa23320e..47b765270239 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + bool locked = false; /* * Force write-protection if the page is being tracked. Note, the page @@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) if (sp->unsync) continue; + /* + * TDP MMU page faults require an additional spinlock as they + * run with mmu_lock held for read, not write, and the unsync + * logic is not thread safe. Take the spinklock regardless of + * the MMU type to avoid extra conditionals/parameters, there's + * no meaningful penalty if mmu_lock is held for write. + */ + if (!locked) { + locked = true; + spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + + /* + * Recheck after taking the spinlock, a different vCPU + * may have since marked the page unsync. A false + * positive on the unprotected check above is not + * possible as clearing sp->unsync _must_ hold mmu_lock + * for write, i.e. unsync cannot transition from 0->1 + * while this CPU holds mmu_lock for read (or write). + */ + if (READ_ONCE(sp->unsync)) + continue; + } + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } + if (locked) + spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible @@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); + if (!kvm_mmu_init_tdp_mmu(kvm)) /* * No smp_load/store wrappers needed here as we are in -- cgit v1.2.3 From 0f923e07124df069ba68d8bb12324398f4b6b709 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 15 Jul 2021 01:56:24 +0300 Subject: KVM: nSVM: avoid picking up unsupported bits from L2 in int_ctl (CVE-2021-3653) * Invert the mask of bits that we pick from L2 in nested_vmcb02_prepare_control * Invert and explicitly use VIRQ related bits bitmask in svm_clear_vintr This fixes a security issue that allowed a malicious L1 to run L2 with AVIC enabled, which allowed the L2 to exploit the uninitialized and enabled AVIC to read/write the host physical memory at some offsets. Fixes: 3d6368ef580a ("KVM: SVM: Add VMRUN handler") Signed-off-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 2 ++ arch/x86/kvm/svm/nested.c | 10 +++++++--- arch/x86/kvm/svm/svm.c | 9 +++++---- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index e322676039f4..b00dbc5fac2b 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -184,6 +184,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_IGN_TPR_SHIFT 20 #define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) +#define V_IRQ_INJECTION_BITS_MASK (V_IRQ_MASK | V_INTR_PRIO_MASK | V_IGN_TPR_MASK) + #define V_INTR_MASKING_SHIFT 24 #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 61738ff8ef33..28381ca5221c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -503,7 +503,11 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { - const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + const u32 int_ctl_vmcb01_bits = + V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; + + const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; + struct kvm_vcpu *vcpu = &svm->vcpu; /* @@ -535,8 +539,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; svm->vmcb->control.int_ctl = - (svm->nested.ctl.int_ctl & ~mask) | - (svm->vmcb01.ptr->control.int_ctl & mask); + (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | + (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e8ccab50ebf6..69639f9624f5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1589,17 +1589,18 @@ static void svm_set_vintr(struct vcpu_svm *svm) static void svm_clear_vintr(struct vcpu_svm *svm) { - const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; svm_clr_intercept(svm, INTERCEPT_VINTR); /* Drop int_ctl fields related to VINTR injection. */ - svm->vmcb->control.int_ctl &= mask; + svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; if (is_guest_mode(&svm->vcpu)) { - svm->vmcb01.ptr->control.int_ctl &= mask; + svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != (svm->nested.ctl.int_ctl & V_TPR_MASK)); - svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; + + svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & + V_IRQ_INJECTION_BITS_MASK; } vmcb_mark_dirty(svm->vmcb, VMCB_INTR); -- cgit v1.2.3 From c7dfa4009965a9b2d7b329ee970eb8da0d32f0bc Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 19 Jul 2021 16:05:00 +0300 Subject: KVM: nSVM: always intercept VMLOAD/VMSAVE when nested (CVE-2021-3656) If L1 disables VMLOAD/VMSAVE intercepts, and doesn't enable Virtual VMLOAD/VMSAVE (currently not supported for the nested hypervisor), then VMLOAD/VMSAVE must operate on the L1 physical memory, which is only possible by making L0 intercept these instructions. Failure to do so allowed the nested guest to run VMLOAD/VMSAVE unintercepted, and thus read/write portions of the host physical memory. Fixes: 89c8a4984fc9 ("KVM: SVM: Enable Virtual VMLOAD VMSAVE feature") Suggested-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 28381ca5221c..e5515477c30a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -158,6 +158,9 @@ void recalc_intercepts(struct vcpu_svm *svm) /* If SMI is not intercepted, ignore guest SMI intercept as well */ if (!intercept_smi) vmcb_clr_intercept(c, INTERCEPT_SMI); + + vmcb_set_intercept(c, INTERCEPT_VMLOAD); + vmcb_set_intercept(c, INTERCEPT_VMSAVE); } static void copy_vmcb_control_area(struct vmcb_control_area *dst, -- cgit v1.2.3 From c53c6b7409f4cd9e542991b53d597fbe2751d7db Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Tue, 24 Aug 2021 12:06:22 +0800 Subject: perf/x86/intel/pt: Fix mask of num_address_ranges Per SDM, bit 2:0 of CPUID(0x14,1).EAX[2:0] reports the number of configurable address ranges for filtering, not bit 1:0. Signed-off-by: Xiaoyao Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Link: https://lkml.kernel.org/r/20210824040622.4081502-1-xiaoyao.li@intel.com --- arch/x86/events/intel/pt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 915847655c06..b044577785bb 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -62,7 +62,7 @@ static struct pt_cap_desc { PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), - PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), + PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7), PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff), PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), -- cgit v1.2.3 From 0b3a8738b76fe2087f7bc2bd59f4c78504c79180 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 6 Jul 2021 12:45:53 +0100 Subject: perf/x86/intel/uncore: Fix integer overflow on 23 bit left shift of a u32 The u32 variable pci_dword is being masked with 0x1fffffff and then left shifted 23 places. The shift is a u32 operation,so a value of 0x200 or more in pci_dword will overflow the u32 and only the bottow 32 bits are assigned to addr. I don't believe this was the original intent. Fix this by casting pci_dword to a resource_size_t to ensure no overflow occurs. Note that the mask and 12 bit left shift operation does not need this because the mask SNR_IMC_MMIO_MEM0_MASK and shift is always a 32 bit value. Fixes: ee49532b38dd ("perf/x86/intel/uncore: Add IMC uncore support for Snow Ridge") Addresses-Coverity: ("Unintentional integer overflow") Signed-off-by: Colin Ian King Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20210706114553.28249-1-colin.king@canonical.com --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 609c24aec71a..c682b09b18fa 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -4811,7 +4811,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, return; pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword); - addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; + addr = ((resource_size_t)pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; pci_read_config_dword(pdev, mem_offset, &pci_dword); addr |= (pci_dword & SNR_IMC_MMIO_MEM0_MASK) << 12; -- cgit v1.2.3 From 26db2e0c51fe83e1dd852c1321407835b481806e Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:10:42 -0500 Subject: perf/x86/amd/ibs: Work around erratum #1197 Erratum #1197 "IBS (Instruction Based Sampling) Register State May be Incorrect After Restore From CC6" is published in a document: "Revision Guide for AMD Family 19h Models 00h-0Fh Processors" 56683 Rev. 1.04 July 2021 https://bugzilla.kernel.org/show_bug.cgi?id=206537 Implement the erratum's suggested workaround and ignore IBS samples if MSRC001_1031 == 0. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210817221048.88063-3-kim.phillips@amd.com --- arch/x86/events/amd/ibs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 40669eac9d6d..921f47b9bb24 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -90,6 +90,7 @@ struct perf_ibs { unsigned long offset_mask[1]; int offset_max; unsigned int fetch_count_reset_broken : 1; + unsigned int fetch_ignore_if_zero_rip : 1; struct cpu_perf_ibs __percpu *pcpu; struct attribute **format_attrs; @@ -672,6 +673,10 @@ fail: if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { regs.flags &= ~PERF_EFLAGS_EXACT; } else { + /* Workaround for erratum #1197 */ + if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1])) + goto out; + set_linear_ip(®s, ibs_data.regs[1]); regs.flags |= PERF_EFLAGS_EXACT; } @@ -769,6 +774,9 @@ static __init void perf_event_ibs_init(void) if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18) perf_ibs_fetch.fetch_count_reset_broken = 1; + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) + perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ibs_caps & IBS_CAPS_OPCNT) { -- cgit v1.2.3 From f11dd0d80555cdc8eaf5cfc9e19c9e198217f9f1 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:10:41 -0500 Subject: perf/x86/amd/ibs: Extend PERF_PMU_CAP_NO_EXCLUDE to IBS Op Commit: 2ff40250691e ("perf/core, arch/x86: Use PERF_PMU_CAP_NO_EXCLUDE for exclusion incapable PMUs") neglected to do so. Fixes: 2ff40250691e ("perf/core, arch/x86: Use PERF_PMU_CAP_NO_EXCLUDE for exclusion incapable PMUs") Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210817221048.88063-2-kim.phillips@amd.com --- arch/x86/events/amd/ibs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 921f47b9bb24..ccc9ee1971e8 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -571,6 +571,7 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, -- cgit v1.2.3 From ccf26483416a339c114409f6e7cd02abdeaf8052 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 17 Aug 2021 17:10:43 -0500 Subject: perf/x86/amd/power: Assign pmu.module Assign pmu.module so the driver can't be unloaded whilst in use. Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210817221048.88063-4-kim.phillips@amd.com --- arch/x86/events/amd/power.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 16a2369c586e..37d5b380516e 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -213,6 +213,7 @@ static struct pmu pmu_class = { .stop = pmu_event_stop, .read = pmu_event_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .module = THIS_MODULE, }; static int power_cpu_exit(unsigned int cpu) -- cgit v1.2.3