diff options
author | Jakub Kicinski <kuba@kernel.org> | 2022-11-03 21:38:42 +0300 |
---|---|---|
committer | Jakub Kicinski <kuba@kernel.org> | 2022-11-03 23:21:54 +0300 |
commit | fbeb229a6622523c092a13c02bd0e15f69240dde (patch) | |
tree | a48bf5ff455cedae6555ef071e0d8bdb29487824 /arch | |
parent | d9095f92950bd16745b9ec24ebebc12d14b3a3e8 (diff) | |
parent | 9521c9d6a53df9c44a5f5ddbc229ceaf3cf79ef6 (diff) | |
download | linux-fbeb229a6622523c092a13c02bd0e15f69240dde.tar.xz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
No conflicts.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'arch')
49 files changed, 518 insertions, 223 deletions
diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h index 6954dc5d24e9..7184f1dc61f2 100644 --- a/arch/loongarch/include/asm/processor.h +++ b/arch/loongarch/include/asm/processor.h @@ -191,7 +191,7 @@ static inline void flush_thread(void) unsigned long __get_wchan(struct task_struct *p); #define __KSTK_TOS(tsk) ((unsigned long)task_stack_page(tsk) + \ - THREAD_SIZE - 32 - sizeof(struct pt_regs)) + THREAD_SIZE - sizeof(struct pt_regs)) #define task_pt_regs(tsk) ((struct pt_regs *)__KSTK_TOS(tsk)) #define KSTK_EIP(tsk) (task_pt_regs(tsk)->csr_era) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->regs[3]) diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h index 17838c6b7ccd..59c4608de91d 100644 --- a/arch/loongarch/include/asm/ptrace.h +++ b/arch/loongarch/include/asm/ptrace.h @@ -29,7 +29,7 @@ struct pt_regs { unsigned long csr_euen; unsigned long csr_ecfg; unsigned long csr_estat; - unsigned long __last[0]; + unsigned long __last[]; } __aligned(8); static inline int regs_irqs_disabled(struct pt_regs *regs) @@ -133,7 +133,7 @@ static inline void die_if_kernel(const char *str, struct pt_regs *regs) #define current_pt_regs() \ ({ \ unsigned long sp = (unsigned long)__builtin_frame_address(0); \ - (struct pt_regs *)((sp | (THREAD_SIZE - 1)) + 1 - 32) - 1; \ + (struct pt_regs *)((sp | (THREAD_SIZE - 1)) + 1) - 1; \ }) /* Helpers for working with the user stack pointer */ diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S index 97425779ce9f..84970e266658 100644 --- a/arch/loongarch/kernel/head.S +++ b/arch/loongarch/kernel/head.S @@ -84,10 +84,9 @@ SYM_CODE_START(kernel_entry) # kernel entry point la.pcrel tp, init_thread_union /* Set the SP after an empty pt_regs. */ - PTR_LI sp, (_THREAD_SIZE - 32 - PT_SIZE) + PTR_LI sp, (_THREAD_SIZE - PT_SIZE) PTR_ADD sp, sp, tp set_saved_sp sp, t0, t1 - PTR_ADDI sp, sp, -4 * SZREG # init stack pointer bl start_kernel ASM_BUG() diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index 1256e3582475..2526b68f1c0f 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -129,7 +129,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long clone_flags = args->flags; struct pt_regs *childregs, *regs = current_pt_regs(); - childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE - 32; + childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE; /* set up new TSS. */ childregs = (struct pt_regs *) childksp - 1; @@ -236,7 +236,7 @@ bool in_task_stack(unsigned long stack, struct task_struct *task, struct stack_info *info) { unsigned long begin = (unsigned long)task_stack_page(task); - unsigned long end = begin + THREAD_SIZE - 32; + unsigned long end = begin + THREAD_SIZE; if (stack < begin || stack >= end) return false; diff --git a/arch/loongarch/kernel/switch.S b/arch/loongarch/kernel/switch.S index 43ebbc3990f7..202a163cb32f 100644 --- a/arch/loongarch/kernel/switch.S +++ b/arch/loongarch/kernel/switch.S @@ -26,7 +26,7 @@ SYM_FUNC_START(__switch_to) move tp, a2 cpu_restore_nonscratch a1 - li.w t0, _THREAD_SIZE - 32 + li.w t0, _THREAD_SIZE PTR_ADD t0, t0, tp set_saved_sp t0, t1, t2 diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index 43f0a98efe38..bdcd0c7719a9 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -279,6 +279,7 @@ static void emit_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) const u8 t1 = LOONGARCH_GPR_T1; const u8 t2 = LOONGARCH_GPR_T2; const u8 t3 = LOONGARCH_GPR_T3; + const u8 r0 = regmap[BPF_REG_0]; const u8 src = regmap[insn->src_reg]; const u8 dst = regmap[insn->dst_reg]; const s16 off = insn->off; @@ -359,8 +360,6 @@ static void emit_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) break; /* r0 = atomic_cmpxchg(dst + off, r0, src); */ case BPF_CMPXCHG: - u8 r0 = regmap[BPF_REG_0]; - move_reg(ctx, t2, r0); if (isdw) { emit_insn(ctx, lld, r0, t1, 0); @@ -390,8 +389,11 @@ static bool is_signed_bpf_cond(u8 cond) static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool extra_pass) { - const bool is32 = BPF_CLASS(insn->code) == BPF_ALU || - BPF_CLASS(insn->code) == BPF_JMP32; + u8 tm = -1; + u64 func_addr; + bool func_addr_fixed; + int i = insn - ctx->prog->insnsi; + int ret, jmp_offset; const u8 code = insn->code; const u8 cond = BPF_OP(code); const u8 t1 = LOONGARCH_GPR_T1; @@ -400,8 +402,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext const u8 dst = regmap[insn->dst_reg]; const s16 off = insn->off; const s32 imm = insn->imm; - int jmp_offset; - int i = insn - ctx->prog->insnsi; + const u64 imm64 = (u64)(insn + 1)->imm << 32 | (u32)insn->imm; + const bool is32 = BPF_CLASS(insn->code) == BPF_ALU || BPF_CLASS(insn->code) == BPF_JMP32; switch (code) { /* dst = src */ @@ -724,24 +726,23 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: - u8 t7 = -1; jmp_offset = bpf2la_offset(i, off, ctx); if (imm) { move_imm(ctx, t1, imm, false); - t7 = t1; + tm = t1; } else { /* If imm is 0, simply use zero register. */ - t7 = LOONGARCH_GPR_ZERO; + tm = LOONGARCH_GPR_ZERO; } move_reg(ctx, t2, dst); if (is_signed_bpf_cond(BPF_OP(code))) { - emit_sext_32(ctx, t7, is32); + emit_sext_32(ctx, tm, is32); emit_sext_32(ctx, t2, is32); } else { - emit_zext_32(ctx, t7, is32); + emit_zext_32(ctx, tm, is32); emit_zext_32(ctx, t2, is32); } - if (emit_cond_jmp(ctx, cond, t2, t7, jmp_offset) < 0) + if (emit_cond_jmp(ctx, cond, t2, tm, jmp_offset) < 0) goto toofar; break; @@ -775,10 +776,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext /* function call */ case BPF_JMP | BPF_CALL: - int ret; - u64 func_addr; - bool func_addr_fixed; - mark_call(ctx); ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &func_addr, &func_addr_fixed); @@ -811,8 +808,6 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, bool ext /* dst = imm64 */ case BPF_LD | BPF_IMM | BPF_DW: - u64 imm64 = (u64)(insn + 1)->imm << 32 | (u32)insn->imm; - move_imm(ctx, dst, imm64, is32); return 1; diff --git a/arch/parisc/include/asm/hardware.h b/arch/parisc/include/asm/hardware.h index 9d3d7737c58b..a005ebc54779 100644 --- a/arch/parisc/include/asm/hardware.h +++ b/arch/parisc/include/asm/hardware.h @@ -10,12 +10,12 @@ #define SVERSION_ANY_ID PA_SVERSION_ANY_ID struct hp_hardware { - unsigned short hw_type:5; /* HPHW_xxx */ - unsigned short hversion; - unsigned long sversion:28; - unsigned short opt; - const char name[80]; /* The hardware description */ -}; + unsigned int hw_type:8; /* HPHW_xxx */ + unsigned int hversion:12; + unsigned int sversion:12; + unsigned char opt; + unsigned char name[59]; /* The hardware description */ +} __packed; struct parisc_device; diff --git a/arch/parisc/include/uapi/asm/pdc.h b/arch/parisc/include/uapi/asm/pdc.h index e794e143ec5f..7a90070136e8 100644 --- a/arch/parisc/include/uapi/asm/pdc.h +++ b/arch/parisc/include/uapi/asm/pdc.h @@ -363,20 +363,25 @@ #if !defined(__ASSEMBLY__) -/* flags of the device_path */ +/* flags for hardware_path */ #define PF_AUTOBOOT 0x80 #define PF_AUTOSEARCH 0x40 #define PF_TIMER 0x0F -struct device_path { /* page 1-69 */ - unsigned char flags; /* flags see above! */ - unsigned char bc[6]; /* bus converter routing info */ - unsigned char mod; - unsigned int layers[6];/* device-specific layer-info */ -} __attribute__((aligned(8))) ; +struct hardware_path { + unsigned char flags; /* see bit definitions below */ + signed char bc[6]; /* Bus Converter routing info to a specific */ + /* I/O adaptor (< 0 means none, > 63 resvd) */ + signed char mod; /* fixed field of specified module */ +}; + +struct pdc_module_path { /* page 1-69 */ + struct hardware_path path; + unsigned int layers[6]; /* device-specific info (ctlr #, unit # ...) */ +} __attribute__((aligned(8))); struct pz_device { - struct device_path dp; /* see above */ + struct pdc_module_path dp; /* see above */ /* struct iomod *hpa; */ unsigned int hpa; /* HPA base address */ /* char *spa; */ @@ -611,21 +616,6 @@ struct pdc_initiator { /* PDC_INITIATOR */ int mode; }; -struct hardware_path { - char flags; /* see bit definitions below */ - char bc[6]; /* Bus Converter routing info to a specific */ - /* I/O adaptor (< 0 means none, > 63 resvd) */ - char mod; /* fixed field of specified module */ -}; - -/* - * Device path specifications used by PDC. - */ -struct pdc_module_path { - struct hardware_path path; - unsigned int layers[6]; /* device-specific info (ctlr #, unit # ...) */ -}; - /* Only used on some pre-PA2.0 boxes */ struct pdc_memory_map { /* PDC_MEMORY_MAP */ unsigned long hpa; /* mod's register set address */ diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index d126e78e101a..e7ee0c0c91d3 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -882,15 +882,13 @@ void __init walk_central_bus(void) &root); } -static void print_parisc_device(struct parisc_device *dev) +static __init void print_parisc_device(struct parisc_device *dev) { - char hw_path[64]; - static int count; + static int count __initdata; - print_pa_hwpath(dev, hw_path); - pr_info("%d. %s at %pap [%s] { %d, 0x%x, 0x%.3x, 0x%.5x }", - ++count, dev->name, &(dev->hpa.start), hw_path, dev->id.hw_type, - dev->id.hversion_rev, dev->id.hversion, dev->id.sversion); + pr_info("%d. %s at %pap { type:%d, hv:%#x, sv:%#x, rev:%#x }", + ++count, dev->name, &(dev->hpa.start), dev->id.hw_type, + dev->id.hversion, dev->id.sversion, dev->id.hversion_rev); if (dev->num_addrs) { int k; @@ -1079,7 +1077,7 @@ static __init int qemu_print_iodc_data(struct device *lin_dev, void *data) -static int print_one_device(struct device * dev, void * data) +static __init int print_one_device(struct device * dev, void * data) { struct parisc_device * pdev = to_parisc_device(dev); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 699df27b0e2f..2ca5418457ed 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -147,6 +147,7 @@ config PPC select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT + select ARCH_SPLIT_ARG64 if PPC32 select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x @@ -285,7 +286,7 @@ config PPC # config PPC_LONG_DOUBLE_128 - depends on PPC64 + depends on PPC64 && ALTIVEC def_bool $(success,test "$(shell,echo __LONG_DOUBLE_128__ | $(CC) -E -P -)" = 1) config PPC_BARRIER_NOSPEC diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index fab8332fe1ad..751921f6db46 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -32,6 +32,11 @@ static inline void arch_enter_lazy_mmu_mode(void) if (radix_enabled()) return; + /* + * apply_to_page_range can call us this preempt enabled when + * operating on kernel page tables. + */ + preempt_disable(); batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } @@ -47,6 +52,7 @@ static inline void arch_leave_lazy_mmu_mode(void) if (batch->index) __flush_tlb_pending(batch); batch->active = 0; + preempt_enable(); } #define arch_flush_lazy_mmu_mode() do {} while (0) diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index a1142496cd58..6d51b007b59e 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -104,6 +104,13 @@ long sys_ppc_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, unsigned long len2); long sys_ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, size_t len, int advice); +long sys_ppc_sync_file_range2(int fd, unsigned int flags, + unsigned int offset1, + unsigned int offset2, + unsigned int nbytes1, + unsigned int nbytes2); +long sys_ppc_fallocate(int fd, int mode, u32 offset1, u32 offset2, + u32 len1, u32 len2); #endif #ifdef CONFIG_COMPAT long compat_sys_mmap2(unsigned long addr, size_t len, diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 930e36099015..2f68fb2ee4fc 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -813,6 +813,13 @@ kernel_dbg_exc: EXCEPTION_COMMON(0x260) CHECK_NAPPING() addi r3,r1,STACK_FRAME_OVERHEAD + /* + * XXX: Returning from performance_monitor_exception taken as a + * soft-NMI (Linux irqs disabled) may be risky to use interrupt_return + * and could cause bugs in return or elsewhere. That case should just + * restore registers and return. There is a workaround for one known + * problem in interrupt_exit_kernel_prepare(). + */ bl performance_monitor_exception b interrupt_return diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5381a43e50fe..651c36b056bd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2357,9 +2357,21 @@ EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) EXC_COMMON_BEGIN(performance_monitor_common) GEN_COMMON performance_monitor addi r3,r1,STACK_FRAME_OVERHEAD - bl performance_monitor_exception + lbz r4,PACAIRQSOFTMASK(r13) + cmpdi r4,IRQS_ENABLED + bne 1f + bl performance_monitor_exception_async b interrupt_return_srr +1: + bl performance_monitor_exception_nmi + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r9,0 + mtmsrd r9,1 + kuap_kernel_restore r9, r10 + + EXCEPTION_RESTORE_REGS hsrr=0 + RFI_TO_KERNEL /** * Interrupt 0xf20 - Vector Unavailable Interrupt. diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index f9db0a172401..fc6631a80527 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -374,10 +374,18 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); /* - * CT_WARN_ON comes here via program_check_exception, - * so avoid recursion. + * CT_WARN_ON comes here via program_check_exception, so avoid + * recursion. + * + * Skip the assertion on PMIs on 64e to work around a problem caused + * by NMI PMIs incorrectly taking this interrupt return path, it's + * possible for this to hit after interrupt exit to user switches + * context to user. See also the comment in the performance monitor + * handler in exceptions-64e.S */ - if (TRAP(regs) != INTERRUPT_PROGRAM) + if (!IS_ENABLED(CONFIG_PPC_BOOK3E_64) && + TRAP(regs) != INTERRUPT_PROGRAM && + TRAP(regs) != INTERRUPT_PERFMON) CT_WARN_ON(ct_state() == CONTEXT_USER); kuap = kuap_get_and_assert_locked(); diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 978a173eb339..a019ed6fc839 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -532,15 +532,24 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) * Returning to soft-disabled context. * Check if a MUST_HARD_MASK interrupt has become pending, in which * case we need to disable MSR[EE] in the return context. + * + * The MSR[EE] check catches among other things the short incoherency + * in hard_irq_disable() between clearing MSR[EE] and setting + * PACA_IRQ_HARD_DIS. */ ld r12,_MSR(r1) andi. r10,r12,MSR_EE beq .Lfast_kernel_interrupt_return_\srr\() // EE already disabled lbz r11,PACAIRQHAPPENED(r13) andi. r10,r11,PACA_IRQ_MUST_HARD_MASK - beq .Lfast_kernel_interrupt_return_\srr\() // No HARD_MASK pending + bne 1f // HARD_MASK is pending + // No HARD_MASK pending, clear possible HARD_DIS set by interrupt + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + stb r11,PACAIRQHAPPENED(r13) + b .Lfast_kernel_interrupt_return_\srr\() + - /* Must clear MSR_EE from _MSR */ +1: /* Must clear MSR_EE from _MSR */ #ifdef CONFIG_PPC_BOOK3S li r10,0 /* Clear valid before changing _MSR */ diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 1ab4a4d95aba..d451a8229223 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -112,7 +112,7 @@ PPC32_SYSCALL_DEFINE6(ppc32_fadvise64, advice); } -COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2, +PPC32_SYSCALL_DEFINE6(ppc_sync_file_range2, int, fd, unsigned int, flags, unsigned int, offset1, unsigned int, offset2, unsigned int, nbytes1, unsigned int, nbytes2) @@ -122,3 +122,14 @@ COMPAT_SYSCALL_DEFINE6(ppc_sync_file_range2, return ksys_sync_file_range(fd, offset, nbytes, flags); } + +#ifdef CONFIG_PPC32 +SYSCALL_DEFINE6(ppc_fallocate, + int, fd, int, mode, + u32, offset1, u32, offset2, u32, len1, u32, len2) +{ + return ksys_fallocate(fd, mode, + merge_64(offset1, offset2), + merge_64(len1, len2)); +} +#endif diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index e9e0df4f9a61..a0be127475b1 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -394,8 +394,11 @@ 305 common signalfd sys_signalfd compat_sys_signalfd 306 common timerfd_create sys_timerfd_create 307 common eventfd sys_eventfd -308 common sync_file_range2 sys_sync_file_range2 compat_sys_ppc_sync_file_range2 -309 nospu fallocate sys_fallocate compat_sys_fallocate +308 32 sync_file_range2 sys_ppc_sync_file_range2 compat_sys_ppc_sync_file_range2 +308 64 sync_file_range2 sys_sync_file_range2 +308 spu sync_file_range2 sys_sync_file_range2 +309 32 fallocate sys_ppc_fallocate compat_sys_fallocate +309 64 fallocate sys_fallocate 310 nospu subpage_prot sys_subpage_prot 311 32 timerfd_settime sys_timerfd_settime32 311 64 timerfd_settime sys_timerfd_settime diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 61cdd782d3c5..a9f57dad6d91 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -51,6 +51,7 @@ config KVM_BOOK3S_HV_POSSIBLE config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT + depends on !CONTEXT_TRACKING_USER select KVM select KVM_BOOK3S_32_HANDLER select KVM_BOOK3S_PR_POSSIBLE @@ -105,6 +106,7 @@ config KVM_BOOK3S_64_HV config KVM_BOOK3S_64_PR tristate "KVM support without using hypervisor mode in host" depends on KVM_BOOK3S_64 + depends on !CONTEXT_TRACKING_USER select KVM_BOOK3S_PR_POSSIBLE help Support running guest kernels in virtual machines on processors @@ -190,6 +192,7 @@ config KVM_EXIT_TIMING config KVM_E500V2 bool "KVM support for PowerPC E500v2 processors" depends on PPC_E500 && !PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select MMU_NOTIFIER @@ -205,6 +208,7 @@ config KVM_E500V2 config KVM_E500MC bool "KVM support for PowerPC E500MC/E5500/E6500 processors" depends on PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select KVM_BOOKE_HV diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index f76a50291fd7..d491da8d1838 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -36,7 +36,17 @@ int exit_vmx_usercopy(void) { disable_kernel_altivec(); pagefault_enable(); - preempt_enable(); + preempt_enable_no_resched(); + /* + * Must never explicitly call schedule (including preempt_enable()) + * while in a kuap-unlocked user copy, because the AMR register will + * not be saved and restored across context switch. However preempt + * kernels need to be preempted as soon as possible if need_resched is + * set and we are preemptible. The hack here is to schedule a + * decrementer to fire here and reschedule for us if necessary. + */ + if (IS_ENABLED(CONFIG_PREEMPT) && need_resched()) + set_dec(1); return 0; } diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 623a7b7ab38b..9342e79870df 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -43,6 +43,29 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); + +static void acquire_hpte_lock(void) +{ + lock_map_acquire(&hpte_lock_map); +} + +static void release_hpte_lock(void) +{ + lock_map_release(&hpte_lock_map); +} +#else +static void acquire_hpte_lock(void) +{ +} + +static void release_hpte_lock(void) +{ +} +#endif + static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) { @@ -220,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -234,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -243,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -263,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -286,10 +316,13 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } @@ -327,6 +360,7 @@ static long native_hpte_remove(unsigned long hpte_group) return -1; /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; return i; @@ -339,6 +373,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -382,6 +419,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -445,6 +484,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -463,6 +505,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -476,6 +520,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -493,6 +540,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } @@ -517,10 +567,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -580,10 +631,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -765,8 +814,10 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 747492edb75a..51f48984abca 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -404,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); struct change_memory_parms { unsigned long start, end, newpp; - unsigned int step, nr_cpus, master_cpu; + unsigned int step, nr_cpus; + atomic_t master_cpu; atomic_t cpu_counter; }; @@ -478,7 +479,8 @@ static int change_memory_range_fn(void *data) { struct change_memory_parms *parms = data; - if (parms->master_cpu != smp_processor_id()) + // First CPU goes through, all others wait. + if (atomic_xchg(&parms->master_cpu, 1) == 1) return chmem_secondary_loop(parms); // Wait for all but one CPU (this one) to call-in @@ -516,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, chmem_parms.end = end; chmem_parms.step = step; chmem_parms.newpp = newpp; - chmem_parms.master_cpu = smp_processor_id(); + atomic_set(&chmem_parms.master_cpu, 0); cpus_read_lock(); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index df008edf7be0..6df4c6d38b66 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1981,7 +1981,7 @@ repeat: } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) -static DEFINE_SPINLOCK(linear_map_hash_lock); +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { @@ -2005,10 +2005,10 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -2018,14 +2018,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); if (!(linear_map_hash_slots[lmi] & 0x80)) { - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); return; } hidx = linear_map_hash_slots[lmi] & 0x7f; linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 507dc0b5987d..63fd925ccbb8 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -35,6 +35,7 @@ #include <asm/drmem.h> #include "pseries.h" +#include "vas.h" /* pseries_vas_dlpar_cpu() */ /* * This isn't a module but we expose that to userspace @@ -748,6 +749,16 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, return -EINVAL; retval = update_ppp(new_entitled_ptr, NULL); + + if (retval == H_SUCCESS || retval == H_CONSTRAINED) { + /* + * The hypervisor assigns VAS resources based + * on entitled capacity for shared mode. + * Reconfig VAS windows based on DLPAR CPU events. + */ + if (pseries_vas_dlpar_cpu() != 0) + retval = H_HARDWARE; + } } else if (!strcmp(kbuf, "capacity_weight")) { char *endp; *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 0e0524cbe20c..4ad6e510d405 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -200,17 +200,42 @@ static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) struct vas_user_win_ref *tsk_ref; int rc; - rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); - if (!rc) { - tsk_ref = &txwin->vas_win.task_ref; - vas_dump_crb(&crb); - vas_update_csb(&crb, tsk_ref); + while (atomic_read(&txwin->pending_faults)) { + rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); + if (!rc) { + tsk_ref = &txwin->vas_win.task_ref; + vas_dump_crb(&crb); + vas_update_csb(&crb, tsk_ref); + } + atomic_dec(&txwin->pending_faults); } return IRQ_HANDLED; } /* + * irq_default_primary_handler() can be used only with IRQF_ONESHOT + * which disables IRQ before executing the thread handler and enables + * it after. But this disabling interrupt sets the VAS IRQ OFF + * state in the hypervisor. If the NX generates fault interrupt + * during this window, the hypervisor will not deliver this + * interrupt to the LPAR. So use VAS specific IRQ handler instead + * of calling the default primary handler. + */ +static irqreturn_t pseries_vas_irq_handler(int irq, void *data) +{ + struct pseries_vas_window *txwin = data; + + /* + * The thread hanlder will process this interrupt if it is + * already running. + */ + atomic_inc(&txwin->pending_faults); + + return IRQ_WAKE_THREAD; +} + +/* * Allocate window and setup IRQ mapping. */ static int allocate_setup_window(struct pseries_vas_window *txwin, @@ -240,8 +265,9 @@ static int allocate_setup_window(struct pseries_vas_window *txwin, goto out_irq; } - rc = request_threaded_irq(txwin->fault_virq, NULL, - pseries_vas_fault_thread_fn, IRQF_ONESHOT, + rc = request_threaded_irq(txwin->fault_virq, + pseries_vas_irq_handler, + pseries_vas_fault_thread_fn, 0, txwin->name, txwin); if (rc) { pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n", @@ -826,6 +852,25 @@ int vas_reconfig_capabilties(u8 type, int new_nr_creds) mutex_unlock(&vas_pseries_mutex); return rc; } + +int pseries_vas_dlpar_cpu(void) +{ + int new_nr_creds, rc; + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds); + } + + if (rc) + pr_err("Failed reconfig VAS capabilities with DLPAR\n"); + + return rc; +} + /* * Total number of default credits available (target_credits) * in LPAR depends on number of cores configured. It varies based on @@ -840,7 +885,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int new_nr_creds, len, rc = 0; + int len; + + /* + * For shared CPU partition, the hypervisor assigns total credits + * based on entitled core capacity. So updating VAS windows will + * be called from lparcfg_write(). + */ + if (is_shared_processor()) + return NOTIFY_OK; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -852,19 +905,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, - vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (!rc) { - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, - new_nr_creds); - } - - if (rc) - pr_err("Failed reconfig VAS capabilities with DLPAR\n"); - - return rc; + return pseries_vas_dlpar_cpu(); } static struct notifier_block pseries_vas_nb = { diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 333ffa2f9f42..7115043ec488 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -132,6 +132,7 @@ struct pseries_vas_window { u64 flags; char *name; int fault_virq; + atomic_t pending_faults; /* Number of pending faults */ }; int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); @@ -140,10 +141,15 @@ int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS int vas_migration_handler(int action); +int pseries_vas_dlpar_cpu(void); #else static inline int vas_migration_handler(int action) { return 0; } +static inline int pseries_vas_dlpar_cpu(void) +{ + return 0; +} #endif #endif /* _VAS_H */ diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6b48a3ae9843..fa78595a6089 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -411,14 +411,16 @@ config RISCV_ISA_SVPBMT If you don't know what to do here, say Y. -config CC_HAS_ZICBOM +config TOOLCHAIN_HAS_ZICBOM bool - default y if 64BIT && $(cc-option,-mabi=lp64 -march=rv64ima_zicbom) - default y if 32BIT && $(cc-option,-mabi=ilp32 -march=rv32ima_zicbom) + default y + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zicbom) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zicbom) + depends on LLD_VERSION >= 150000 || LD_VERSION >= 23800 config RISCV_ISA_ZICBOM bool "Zicbom extension support for non-coherent DMA operation" - depends on CC_HAS_ZICBOM + depends on TOOLCHAIN_HAS_ZICBOM depends on !XIP_KERNEL && MMU select RISCV_DMA_NONCOHERENT select RISCV_ALTERNATIVE @@ -433,6 +435,13 @@ config RISCV_ISA_ZICBOM If you don't know what to do here, say Y. +config TOOLCHAIN_HAS_ZIHINTPAUSE + bool + default y + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zihintpause) + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zihintpause) + depends on LLD_VERSION >= 150000 || LD_VERSION >= 23600 + config FPU bool "FPU support" default y diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 1c8ec656e916..0d13b597cb55 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -59,12 +59,10 @@ toolchain-need-zicsr-zifencei := $(call cc-option-yn, -march=$(riscv-march-y)_zi riscv-march-$(toolchain-need-zicsr-zifencei) := $(riscv-march-y)_zicsr_zifencei # Check if the toolchain supports Zicbom extension -toolchain-supports-zicbom := $(call cc-option-yn, -march=$(riscv-march-y)_zicbom) -riscv-march-$(toolchain-supports-zicbom) := $(riscv-march-y)_zicbom +riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZICBOM) := $(riscv-march-y)_zicbom # Check if the toolchain supports Zihintpause extension -toolchain-supports-zihintpause := $(call cc-option-yn, -march=$(riscv-march-y)_zihintpause) -riscv-march-$(toolchain-supports-zihintpause) := $(riscv-march-y)_zihintpause +riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE) := $(riscv-march-y)_zihintpause KBUILD_CFLAGS += -march=$(subst fd,,$(riscv-march-y)) KBUILD_AFLAGS += -march=$(riscv-march-y) diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h index 38af2ec7b9bf..6d58bbb5da46 100644 --- a/arch/riscv/include/asm/jump_label.h +++ b/arch/riscv/include/asm/jump_label.h @@ -14,8 +14,8 @@ #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct static_key *key, - bool branch) +static __always_inline bool arch_static_branch(struct static_key * const key, + const bool branch) { asm_volatile_goto( " .option push \n\t" @@ -35,8 +35,8 @@ label: return true; } -static __always_inline bool arch_static_branch_jump(struct static_key *key, - bool branch) +static __always_inline bool arch_static_branch_jump(struct static_key * const key, + const bool branch) { asm_volatile_goto( " .option push \n\t" diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h index 1e4f8b4aef79..fa70cfe507aa 100644 --- a/arch/riscv/include/asm/vdso/processor.h +++ b/arch/riscv/include/asm/vdso/processor.h @@ -21,7 +21,7 @@ static inline void cpu_relax(void) * Reduce instruction retirement. * This assumes the PC changes. */ -#ifdef __riscv_zihintpause +#ifdef CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE __asm__ __volatile__ ("pause"); #else /* Encoding of the pause instruction */ diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index fa427bdcf773..852ecccd8920 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -213,6 +213,9 @@ static void print_mmu(struct seq_file *f) static void *c_start(struct seq_file *m, loff_t *pos) { + if (*pos == nr_cpu_ids) + return NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return (void *)(uintptr_t)(1 + *pos); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index a22e418dbd82..e1226709490f 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -113,6 +113,8 @@ static void __init kasan_populate_pud(pgd_t *pgd, base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); } else if (pgd_none(*pgd)) { base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + memcpy(base_pud, (void *)kasan_early_shadow_pud, + sizeof(pud_t) * PTRS_PER_PUD); } else { base_pud = (pud_t *)pgd_page_vaddr(*pgd); if (base_pud == lm_alias(kasan_early_shadow_pud)) { @@ -173,8 +175,11 @@ static void __init kasan_populate_p4d(pgd_t *pgd, base_p4d = pt_ops.get_p4d_virt(pfn_to_phys(_pgd_pfn(*pgd))); } else { base_p4d = (p4d_t *)pgd_page_vaddr(*pgd); - if (base_p4d == lm_alias(kasan_early_shadow_p4d)) + if (base_p4d == lm_alias(kasan_early_shadow_p4d)) { base_p4d = memblock_alloc(PTRS_PER_PUD * sizeof(p4d_t), PAGE_SIZE); + memcpy(base_p4d, (void *)kasan_early_shadow_p4d, + sizeof(p4d_t) * PTRS_PER_P4D); + } } p4dp = base_p4d + p4d_index(vaddr); diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S index af5c6860e0a1..fa9d33b01b85 100644 --- a/arch/s390/boot/vmlinux.lds.S +++ b/arch/s390/boot/vmlinux.lds.S @@ -102,8 +102,17 @@ SECTIONS _compressed_start = .; *(.vmlinux.bin.compressed) _compressed_end = .; - FILL(0xff); - . = ALIGN(4096); + } + +#define SB_TRAILER_SIZE 32 + /* Trailer needed for Secure Boot */ + . += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */ + . = ALIGN(4096) - SB_TRAILER_SIZE; + .sb.trailer : { + QUAD(0) + QUAD(0) + QUAD(0) + QUAD(0x000000207a49504c) } _end = .; diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index e08c882dccaa..eaeaeb3ff0be 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -17,7 +17,8 @@ "3: jl 1b\n" \ " lhi %0,0\n" \ "4: sacf 768\n" \ - EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ + EX_TABLE(0b,4b) EX_TABLE(1b,4b) \ + EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ "=m" (*uaddr) \ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c index d5c7c1e30c17..74b53c531e0c 100644 --- a/arch/s390/kernel/perf_pai_ext.c +++ b/arch/s390/kernel/perf_pai_ext.c @@ -459,6 +459,7 @@ static int paiext_push_sample(void) raw.frag.data = cpump->save; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 58033dfcb6d4..720036fb1924 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -157,7 +157,7 @@ unsigned long __clear_user(void __user *to, unsigned long size) asm volatile( " lr 0,%[spec]\n" "0: mvcos 0(%1),0(%4),%0\n" - " jz 4f\n" + "6: jz 4f\n" "1: algr %0,%2\n" " slgr %1,%2\n" " j 0b\n" @@ -167,11 +167,11 @@ unsigned long __clear_user(void __user *to, unsigned long size) " clgr %0,%3\n" /* copy crosses next page boundary? */ " jnh 5f\n" "3: mvcos 0(%1),0(%4),%3\n" - " slgr %0,%3\n" + "7: slgr %0,%3\n" " j 5f\n" "4: slgr %0,%0\n" "5:\n" - EX_TABLE(0b,2b) EX_TABLE(3b,5b) + EX_TABLE(0b,2b) EX_TABLE(6b,2b) EX_TABLE(3b,5b) EX_TABLE(7b,5b) : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) : "a" (empty_zero_page), [spec] "d" (spec.val) : "cc", "memory", "0"); diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c index 080c88620723..588089332931 100644 --- a/arch/s390/pci/pci_mmio.c +++ b/arch/s390/pci/pci_mmio.c @@ -64,7 +64,7 @@ static inline int __pcistg_mio_inuser( asm volatile ( " sacf 256\n" "0: llgc %[tmp],0(%[src])\n" - " sllg %[val],%[val],8\n" + "4: sllg %[val],%[val],8\n" " aghi %[src],1\n" " ogr %[val],%[tmp]\n" " brctg %[cnt],0b\n" @@ -72,7 +72,7 @@ static inline int __pcistg_mio_inuser( "2: ipm %[cc]\n" " srl %[cc],28\n" "3: sacf 768\n" - EX_TABLE(0b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b) + EX_TABLE(0b, 3b) EX_TABLE(4b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b) : [src] "+a" (src), [cnt] "+d" (cnt), [val] "+d" (val), [tmp] "=d" (tmp), @@ -215,10 +215,10 @@ static inline int __pcilg_mio_inuser( "2: ahi %[shift],-8\n" " srlg %[tmp],%[val],0(%[shift])\n" "3: stc %[tmp],0(%[dst])\n" - " aghi %[dst],1\n" + "5: aghi %[dst],1\n" " brctg %[cnt],2b\n" "4: sacf 768\n" - EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) + EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) EX_TABLE(5b, 4b) : [ioaddr_len] "+&d" (ioaddr_len.pair), [cc] "+d" (cc), [val] "=d" (val), diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c index b7664d018851..8fa58b0f3cb3 100644 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -27,13 +27,17 @@ #include <asm/cpu_device_id.h> #include <asm/simd.h> +#define POLYVAL_ALIGN 16 +#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) +#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) +#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA) #define NUM_KEY_POWERS 8 struct polyval_tfm_ctx { /* * These powers must be in the order h^8, ..., h^1. */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; + u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR; }; struct polyval_desc_ctx { @@ -45,6 +49,11 @@ asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator); asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); +static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) +{ + return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN); +} + static void internal_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator) { @@ -72,7 +81,7 @@ static void internal_polyval_mul(u8 *op1, const u8 *op2) static int polyval_x86_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { - struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); + struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm); int i; if (keylen != POLYVAL_BLOCK_SIZE) @@ -102,7 +111,7 @@ static int polyval_x86_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); u8 *pos; unsigned int nblocks; unsigned int n; @@ -143,7 +152,7 @@ static int polyval_x86_update(struct shash_desc *desc, static int polyval_x86_final(struct shash_desc *desc, u8 *dst) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); if (dctx->bytes) { internal_polyval_mul(dctx->buffer, @@ -167,7 +176,7 @@ static struct shash_alg polyval_alg = { .cra_driver_name = "polyval-clmulni", .cra_priority = 200, .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct polyval_tfm_ctx), + .cra_ctxsize = POLYVAL_CTX_SIZE, .cra_module = THIS_MODULE, }, }; diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 3271735f0070..4cb710efbdd9 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -801,7 +801,7 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, /* Extension Memory */ if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { - data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL; if (op_data2->rmt_node) { data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; /* IBS doesn't provide Remote socket detail */ diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 77e3a47af5ad..fea544e5842a 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -806,7 +806,11 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 3b87d889b6e1..888731ccf1f6 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -10,10 +10,13 @@ /* Even with __builtin_ the compiler may decide to use the out of line function. */ +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) +#include <linux/kmsan_string.h> +#endif + #define __HAVE_ARCH_MEMCPY 1 -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memcpy -void *__msan_memcpy(void *dst, const void *src, size_t size); #define memcpy __msan_memcpy #else extern void *memcpy(void *to, const void *from, size_t len); @@ -21,7 +24,7 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) extern void *__msan_memset(void *s, int c, size_t n); #undef memset #define memset __msan_memset @@ -67,7 +70,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memmove void *__msan_memmove(void *dest, const void *src, size_t len); #define memmove __msan_memmove diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bc614cfe21b..1cc756eafa44 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -254,24 +254,25 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ __typeof__(*(ptr)) __x = (x); /* eval x once */ \ - __chk_user_ptr(ptr); \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ switch (size) { \ case 1: \ - __put_user_goto(__x, ptr, "b", "iq", label); \ + __put_user_goto(__x, __ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(__x, ptr, "w", "ir", label); \ + __put_user_goto(__x, __ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(__x, ptr, "l", "ir", label); \ + __put_user_goto(__x, __ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(__x, ptr, label); \ + __put_user_goto_u64(__x, __ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ - instrument_put_user(__x, ptr, size); \ + instrument_put_user(__x, __ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7065462378e2..0810e93cbedc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1133,11 +1133,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = max(entry->eax, 0x80000021); break; case 0x80000001: + entry->ebx &= ~GENMASK(27, 16); cpuid_entry_override(entry, CPUID_8000_0001_EDX); cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; case 0x80000006: - /* L2 cache and TLB: pass through host info. */ + /* Drop reserved bits, pass host L2 cache and TLB info. */ + entry->edx &= ~GENMASK(17, 16); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -1167,6 +1169,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); + entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); break; @@ -1186,6 +1189,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ecx = entry->edx = 0; break; case 0x8000001a: + entry->eax &= GENMASK(2, 0); + entry->ebx = entry->ecx = entry->edx = 0; + break; case 0x8000001e: break; case 0x8000001F: @@ -1193,7 +1199,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = entry->edx = 0; } else { cpuid_entry_override(entry, CPUID_8000_001F_EAX); - + /* Clear NumVMPL since KVM does not support VMPL. */ + entry->ebx &= ~GENMASK(31, 12); /* * Enumerate '0' for "PA bits reduction", the adjusted * MAXPHYADDR is enumerated directly (see 0x80000008). diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index cfed36aba2f7..c1390357126a 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -158,11 +158,16 @@ out: static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) { struct kvm *kvm = inode->i_private; + int r; if (!kvm_get_kvm_safe(kvm)) return -ENOENT; - return single_open(file, kvm_mmu_rmaps_stat_show, kvm); + r = single_open(file, kvm_mmu_rmaps_stat_show, kvm); + if (r < 0) + kvm_put_kvm(kvm); + + return r; } static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3b27622d4642..4a43261d25a2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -791,8 +791,7 @@ static int linearize(struct x86_emulate_ctxt *ctxt, ctxt->mode, linear); } -static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, - enum x86emul_mode mode) +static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst) { ulong linear; int rc; @@ -802,41 +801,71 @@ static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst, if (ctxt->op_bytes != sizeof(unsigned long)) addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1); - rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear); + rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear); if (rc == X86EMUL_CONTINUE) ctxt->_eip = addr.ea; return rc; } +static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt) +{ + u64 efer; + struct desc_struct cs; + u16 selector; + u32 base3; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) { + /* Real mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_REAL; + return X86EMUL_CONTINUE; + } + + if (ctxt->eflags & X86_EFLAGS_VM) { + /* Protected/VM86 mode. cpu must not have long mode active */ + if (efer & EFER_LMA) + return X86EMUL_UNHANDLEABLE; + ctxt->mode = X86EMUL_MODE_VM86; + return X86EMUL_CONTINUE; + } + + if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS)) + return X86EMUL_UNHANDLEABLE; + + if (efer & EFER_LMA) { + if (cs.l) { + /* Proper long mode */ + ctxt->mode = X86EMUL_MODE_PROT64; + } else if (cs.d) { + /* 32 bit compatibility mode*/ + ctxt->mode = X86EMUL_MODE_PROT32; + } else { + ctxt->mode = X86EMUL_MODE_PROT16; + } + } else { + /* Legacy 32 bit / 16 bit mode */ + ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + } + + return X86EMUL_CONTINUE; +} + static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) { - return assign_eip(ctxt, dst, ctxt->mode); + return assign_eip(ctxt, dst); } -static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, - const struct desc_struct *cs_desc) +static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst) { - enum x86emul_mode mode = ctxt->mode; - int rc; + int rc = emulator_recalc_and_set_mode(ctxt); -#ifdef CONFIG_X86_64 - if (ctxt->mode >= X86EMUL_MODE_PROT16) { - if (cs_desc->l) { - u64 efer = 0; + if (rc != X86EMUL_CONTINUE) + return rc; - ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); - if (efer & EFER_LMA) - mode = X86EMUL_MODE_PROT64; - } else - mode = X86EMUL_MODE_PROT32; /* temporary value */ - } -#endif - if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32) - mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - rc = assign_eip(ctxt, dst, mode); - if (rc == X86EMUL_CONTINUE) - ctxt->mode = mode; - return rc; + return assign_eip(ctxt, dst); } static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -2172,7 +2201,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2250,7 +2279,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, eip, &new_desc); + rc = assign_eip_far(ctxt, eip); /* Error handling is not implemented. */ if (rc != X86EMUL_CONTINUE) return X86EMUL_UNHANDLEABLE; @@ -2432,7 +2461,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); - for (i = 0; i < NR_EMULATOR_GPRS; i++) + for (i = 0; i < 8; i++) *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); @@ -2489,7 +2518,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u16 selector; int i, r; - for (i = 0; i < NR_EMULATOR_GPRS; i++) + for (i = 0; i < 16; i++) *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); @@ -2633,7 +2662,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) * those side effects need to be explicitly handled for both success * and shutdown. */ - return X86EMUL_CONTINUE; + return emulator_recalc_and_set_mode(ctxt); emulate_shutdown: ctxt->ops->triple_fault(ctxt); @@ -2876,6 +2905,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); ctxt->_eip = rdx; + ctxt->mode = usermode; *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; @@ -3469,7 +3499,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; - rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc); + rc = assign_eip_far(ctxt, ctxt->src.val); if (rc != X86EMUL_CONTINUE) goto fail; @@ -3611,11 +3641,25 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt) static int em_cr_write(struct x86_emulate_ctxt *ctxt) { - if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) + int cr_num = ctxt->modrm_reg; + int r; + + if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val)) return emulate_gp(ctxt, 0); /* Disable writeback. */ ctxt->dst.type = OP_NONE; + + if (cr_num == 0) { + /* + * CR0 write might have updated CR0.PE and/or CR0.PG + * which can affect the cpu's execution mode. + */ + r = emulator_recalc_and_set_mode(ctxt); + if (r != X86EMUL_CONTINUE) + return r; + } + return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9dba04b6b019..65f092e4a81b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8263,6 +8263,11 @@ static __init int hardware_setup(void) if (!cpu_has_virtual_nmis()) enable_vnmi = 0; +#ifdef CONFIG_X86_SGX_KVM + if (!cpu_has_vmx_encls_vmexit()) + enable_sgx = false; +#endif + /* * set_apic_access_page_addr() is used to reload apic access * page upon invalidation. No need to do anything if not diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cf1ba865562..521b433f978c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2315,11 +2315,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, /* we verify if the enable bit is set... */ if (system_time & 1) { - kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu, - KVM_HOST_USES_PFN, system_time & ~1ULL, - sizeof(struct pvclock_vcpu_time_info)); + kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, + KVM_HOST_USES_PFN, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); } else { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); } return; @@ -3388,7 +3388,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); vcpu->arch.time = 0; } @@ -10044,7 +10044,20 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, kvm_x86_ops.nested_ops->has_events(vcpu)) *req_immediate_exit = true; - WARN_ON(kvm_is_exception_pending(vcpu)); + /* + * KVM must never queue a new exception while injecting an event; KVM + * is done emulating and should only propagate the to-be-injected event + * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an + * infinite loop as KVM will bail from VM-Enter to inject the pending + * exception and start the cycle all over. + * + * Exempt triple faults as they have special handling and won't put the + * vCPU into an infinite loop. Triple fault can be queued when running + * VMX without unrestricted guest, as that requires KVM to emulate Real + * Mode events (see kvm_inject_realmode_interrupt()). + */ + WARN_ON_ONCE(vcpu->arch.exception.pending || + vcpu->arch.exception_vmexit.pending); return 0; out: @@ -11816,6 +11829,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + kvm_gpc_init(&vcpu->arch.pv_time); + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 93c628d3e3a9..2dae413bd62a 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -42,13 +42,13 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) int idx = srcu_read_lock(&kvm->srcu); if (gfn == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(kvm, gpc); + kvm_gpc_deactivate(kvm, gpc); goto out; } do { - ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, KVM_HOST_USES_PFN, - gpa, PAGE_SIZE); + ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa, + PAGE_SIZE); if (ret) goto out; @@ -554,15 +554,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) offsetof(struct compat_vcpu_info, time)); if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_info)); + r = kvm_gpc_activate(vcpu->kvm, + &vcpu->arch.xen.vcpu_info_cache, NULL, + KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -570,16 +570,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct pvclock_vcpu_time_info)); + r = kvm_gpc_activate(vcpu->kvm, + &vcpu->arch.xen.vcpu_time_info_cache, + NULL, KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct pvclock_vcpu_time_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); break; @@ -590,16 +590,15 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } if (data->u.gpa == GPA_INVALID) { - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, + &vcpu->arch.xen.runstate_cache); r = 0; break; } - r = kvm_gfn_to_pfn_cache_init(vcpu->kvm, - &vcpu->arch.xen.runstate_cache, - NULL, KVM_HOST_USES_PFN, data->u.gpa, - sizeof(struct vcpu_runstate_info)); + r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache, + NULL, KVM_HOST_USES_PFN, data->u.gpa, + sizeof(struct vcpu_runstate_info)); break; case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT: @@ -1667,18 +1666,18 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, case EVTCHNSTAT_ipi: /* IPI must map back to the same port# */ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) - goto out; /* -EINVAL */ + goto out_noeventfd; /* -EINVAL */ break; case EVTCHNSTAT_interdomain: if (data->u.evtchn.deliver.port.port) { if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) - goto out; /* -EINVAL */ + goto out_noeventfd; /* -EINVAL */ } else { eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd); if (IS_ERR(eventfd)) { ret = PTR_ERR(eventfd); - goto out; + goto out_noeventfd; } } break; @@ -1718,6 +1717,7 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, out: if (eventfd) eventfd_ctx_put(eventfd); +out_noeventfd: kfree(evtchnfd); return ret; } @@ -1816,7 +1816,12 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) { vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; vcpu->arch.xen.poll_evtchn = 0; + timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); + + kvm_gpc_init(&vcpu->arch.xen.runstate_cache); + kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache); } void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) @@ -1824,18 +1829,17 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) if (kvm_xen_timer_enabled(vcpu)) kvm_xen_stop_timer(vcpu); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.runstate_cache); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_info_cache); - kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, - &vcpu->arch.xen.vcpu_time_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache); + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache); + del_timer_sync(&vcpu->arch.xen.poll_timer); } void kvm_xen_init_vm(struct kvm *kvm) { idr_init(&kvm->arch.xen.evtchn_ports); + kvm_gpc_init(&kvm->arch.xen.shinfo_cache); } void kvm_xen_destroy_vm(struct kvm *kvm) @@ -1843,7 +1847,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm) struct evtchnfd *evtchnfd; int i; - kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache); + kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache); idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { if (!evtchnfd->deliver.port.port) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 58a200dc762d..17f09dc26381 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -26,6 +26,7 @@ GCOV_PROFILE := n KASAN_SANITIZE := n UBSAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n KCOV_INSTRUMENT := n # These are adjustments to the compiler flags used for objects that |